Mirror of https://github.com/MarshalX/telegram-crawler.git (synced 2025-01-11 12:41:37 +01:00)
reorganise structure of data folder; split the "make files tree" workflow into 2 parts (to be split into 4 later)
commit 9cab070822 (parent b9fc0ea79f)
5 changed files with 1636 additions and 1564 deletions
.github/workflows/make_files_tree.yml (vendored, 28 lines changed)
@@ -12,8 +12,17 @@ jobs:
   fetch_new_content:
     name: Make files tree
     runs-on: macos-10.15
     continue-on-error: true
     timeout-minutes: 10
+
+    strategy:
+      fail-fast: false
+      matrix:
+        mode: [
+          "0",
+          "6"
+        ]
+
     steps:

       - name: Clone.
@@ -37,18 +46,35 @@ jobs:
           TELEGRAM_SESSION_TEST: ${{ secrets.TELEGRAM_SESSION_TEST }}
           TELEGRAM_API_ID: ${{ secrets.TELEGRAM_API_ID }}
           TELEGRAM_API_HASH: ${{ secrets.TELEGRAM_API_HASH }}
+          MODE: ${{ matrix.mode }}
         run: |
           git pull
           python make_files_tree.py
           rm -rf __pycache__

-      - name: Commit and push changes.
+      - name: Prepare data for mode 0.
+        if: matrix.mode == '0'
+        run: |
+          git checkout data
+
+          mv data/web_res data_ci/web_res
+          rm -rf data
+          mv data_ci data
+
+      - name: Prepare data for mode 6.
+        if: matrix.mode == '6'
+        run: |
+          git checkout data
+
+          mv data/web data_ci/web
+          mv data/client data_ci/client
+          mv data/server data_ci/server
+
+          rm -rf data
+          mv data_ci data
+
+      - name: Commit and push changes.
         run: |
           git config --global user.email "github-action@users.noreply.github.com"
           git config --global user.name "GitHub Action"
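Both "Prepare data" steps do the same swap: restore the committed data/ tree from git, graft the subtrees this mode did not refresh onto the freshly crawled tree, then promote the merged tree. A minimal Python sketch of that swap, assuming (the diff does not show it) that the crawler's fresh output lands in data_ci/ via an OUTPUT_FOLDER override:

    import shutil
    import subprocess

    def prepare_data(mode: str) -> None:
        # Restore the committed snapshot of data/ before merging.
        subprocess.run(['git', 'checkout', 'data'], check=True)
        # Mode 0 refreshes web/client/server, so the old web_res is kept;
        # mode 6 refreshes web_res, so the old web/client/server are kept.
        kept = ['web_res'] if mode == '0' else ['web', 'client', 'server']
        for subtree in kept:
            shutil.move(f'data/{subtree}', f'data_ci/{subtree}')
        shutil.rmtree('data')           # drop the rest of the old tree
        shutil.move('data_ci', 'data')  # promote the merged tree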

make_files_tree.py
@@ -1,4 +1,5 @@
 import asyncio
+import hashlib
 import json
 import logging
 import os
@@ -7,7 +8,6 @@ import re
 import shutil
 import sys
 import zipfile
-import hashlib
 from asyncio.exceptions import TimeoutError
 from string import punctuation, whitespace
 from time import time
@@ -25,7 +25,12 @@ ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace
 DYNAMIC_PART_MOCK = 'telegram-crawler'

 INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
+INPUT_RES_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_res_links.txt')
 OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')
+OUTPUT_MTPROTO_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_MTPROTO_FOLDER', 'server/'))
+OUTPUT_SITES_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_SITES_FOLDER', 'web/'))
+OUTPUT_CLIENTS_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_CLIENTS_FOLDER', 'client/'))
+OUTPUT_RESOURCES_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_RESOURCES_FOLDER', 'web_res/'))

 TRANSLATIONS_EN_CATEGORY_URL_REGEX = r'/en/[a-z_]+/[a-z_]+/$'
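One apparent slip in the added line: INPUT_RES_FILENAME reads the INPUT_FILENAME environment key, so overriding that variable would point both lists at the same file. Judging by the neighbouring constants, the intended line was presumably:

    import os

    INPUT_RES_FILENAME = os.environ.get('INPUT_RES_FILENAME', 'tracked_res_links.txt')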
@@ -44,7 +49,7 @@ SPARKLE_SE_REGEX = r';se=(.*?);'
 SPARKLE_SIG_TEMPLATE = f';sig={DYNAMIC_PART_MOCK};'
 SPARKLE_SE_TEMPLATE = f';se={DYNAMIC_PART_MOCK};'

-stel_dev_layer = 190
+STEL_DEV_LAYER = 190

 # unsecure but so simple
 CONNECTOR = aiohttp.TCPConnector(ssl=False, force_close=True, limit=300)
@@ -56,7 +61,7 @@ HEADERS = {
     'Accept-Encoding': 'gzip, deflate, br',
     'DNT': '1',
     'Connection': 'keep-alive',
-    'Cookie': f'stel_ln=en; stel_dev_layer={stel_dev_layer}',
+    'Cookie': f'stel_ln=en; stel_dev_layer={STEL_DEV_LAYER}',
     'Upgrade-Insecure-Requests': '1',
     'Sec-Fetch-Dest': 'document',
     'Sec-Fetch-Mode': 'navigate',
@@ -127,7 +132,7 @@ async def track_additional_files(
         content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
         content = re.sub(r'name="APKTOOL_DUMMY_.*" id', 'name="tgcrawl" id', content)

-        filename = os.path.join(OUTPUT_FOLDER, output_dir_name, file)
+        filename = os.path.join(output_dir_name, file)
         os.makedirs(os.path.dirname(filename), exist_ok=True)
         async with aiofiles.open(filename, 'w', encoding='utf-8') as w_file:
             await w_file.write(content)
@@ -140,7 +145,7 @@ async def download_telegram_macos_beta_and_extract_resources(session: aiohttp.ClientSession):
     if not download_url:
         return

-    crawled_data_folder = 'telegram-beta-macos'
+    crawled_data_folder = os.path.join(OUTPUT_CLIENTS_FOLDER, 'macos-beta')
     client_folder_name = 'macos'
     client_archive_name = 'macos.zip'
@@ -219,7 +224,7 @@ async def download_telegram_ios_beta_and_extract_resources(session: aiohttp.ClientSession):
     assets_filename = 'Assets.car'
     assets_output_dir = 'ios_assets'
     client_folder_name = 'ios'
-    crawled_data_folder = 'telegram-beta-ios'
+    crawled_data_folder = os.path.join(OUTPUT_CLIENTS_FOLDER, 'ios-beta')

     if 'darwin' not in platform.system().lower():
         await download_file(download_url, ipa_filename, session)
@@ -304,6 +309,8 @@ async def download_telegram_android_beta_and_extract_resources(session: aiohttp.ClientSession):
     parameterized_url = 'apps/drklo-2kb-ghpo/telegram-beta-2/distribution_groups/all-users-of-telegram-beta-2'
     download_url = await get_download_link_of_latest_appcenter_release(parameterized_url, session)

+    crawled_data_folder = os.path.join(OUTPUT_CLIENTS_FOLDER, 'android-beta')
+
     if not download_url:
         return
@@ -328,7 +335,7 @@ async def download_telegram_android_beta_and_extract_resources(session: aiohttp.ClientSession):
         'res/values/strings.xml',
         'res/values/public.xml'
     ]
-    await track_additional_files(files_to_track, 'android', 'telegram-beta-android')
+    await track_additional_files(files_to_track, 'android', crawled_data_folder)

     cleanup()
@@ -428,9 +435,8 @@ async def _fetch_and_track_mtproto(app, output_dir):
     configs['GetConfig'].expires = 0
     configs['GetConfig'].dc_options = []

-    output_dir_name = os.path.join('telegram-mtproto', output_dir)
     for file, content in configs.items():
-        filename = os.path.join(OUTPUT_FOLDER, output_dir_name, f'{file}.json')
+        filename = os.path.join(OUTPUT_MTPROTO_FOLDER, output_dir, f'{file}.json')
         os.makedirs(os.path.dirname(filename), exist_ok=True)
         async with aiofiles.open(filename, 'w', encoding='utf-8') as w_file:
             await w_file.write(str(content))
@@ -460,20 +466,20 @@ class RetryError(Exception):
     ...


-async def crawl(url: str, session: aiohttp.ClientSession):
-    ok = False
-    while not ok:
+async def crawl(url: str, session: aiohttp.ClientSession, output_dir: str = OUTPUT_SITES_FOLDER):
+    while True:
         try:
-            await _crawl(url, session)
-            ok = True
+            await _crawl(url, session, output_dir)
         except (RetryError, ServerDisconnectedError, TimeoutError, ClientConnectorError):
             logger.warning(f'Client or timeout error. Retrying {url}')
+        else:
+            break


-async def _crawl(url: str, session: aiohttp.ClientSession):
+async def _crawl(url: str, session: aiohttp.ClientSession, output_dir: str):
     logger.info(f'Process {url}')
     async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT, headers=HEADERS) as response:
-        if response.status // 100 == 5:
+        if 499 < response.status < 600:
             msg = f'Error 5XX. Retrying {url}'
             logger.warning(msg)
             raise RetryError(msg)
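The 5xx check changes from `response.status // 100 == 5` to `499 < response.status < 600`; for integer HTTP status codes the two forms select exactly the same range, so this is a readability change rather than a behaviour change. A quick check:

    for status in (499, 500, 555, 599, 600):
        assert (status // 100 == 5) == (499 < status < 600)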
@@ -502,7 +508,7 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
     if is_hashable_only or is_sucking_file:
         ext = ''

-    filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
+    filename = os.path.join(output_dir, *url_parts) + ext
     os.makedirs(os.path.dirname(filename), exist_ok=True)

     if is_sucking_file or is_hashable_only:
@@ -529,29 +535,44 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
         await f.write(content)


-async def start(url_list: set[str], mode: int):
+async def crawl_web(session: aiohttp.ClientSession):
+    with open(INPUT_FILENAME, 'r') as f:
+        tracked_urls = set([l.replace('\n', '') for l in f.readlines()])
+
+    await asyncio.gather(*[crawl(url, session) for url in tracked_urls])
+
+
+async def crawl_web_res(session: aiohttp.ClientSession):
+    with open(INPUT_RES_FILENAME, 'r') as f:
+        tracked_urls = set([l.replace('\n', '') for l in f.readlines()])
+
+    await asyncio.gather(*[crawl(url, session, OUTPUT_RESOURCES_FOLDER) for url in tracked_urls])
+
+
+async def start(mode: int):
     async with aiohttp.ClientSession(connector=CONNECTOR) as session:
         # all without web resources
         mode == 0 and await asyncio.gather(
-            *[crawl(url, session) for url in url_list],
+            crawl_web(session),
             download_telegram_android_beta_and_extract_resources(session),
             download_telegram_macos_beta_and_extract_resources(session),
-            # track_mtproto_configs(),
+            track_mtproto_configs(),
             download_telegram_ios_beta_and_extract_resources(session),
         )
-        mode == 1 and await asyncio.gather(*[crawl(url, session) for url in url_list])
+        mode == 1 and await crawl_web(session)
         mode == 2 and await download_telegram_android_beta_and_extract_resources(session)
         mode == 3 and await download_telegram_macos_beta_and_extract_resources(session)
         mode == 4 and await track_mtproto_configs()
         mode == 5 and await download_telegram_ios_beta_and_extract_resources(session)
+        mode == 6 and await crawl_web_res(session)


 if __name__ == '__main__':
     run_mode = int(sys.argv[1]) if len(sys.argv) > 1 else 0

-    with open(INPUT_FILENAME, 'r') as f:
-        tracked_urls = set([l.replace('\n', '') for l in f.readlines()])
+    if 'MODE' in os.environ:
+        run_mode = int(os.environ['MODE'])

     start_time = time()
-    logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
-    asyncio.get_event_loop().run_until_complete(start(tracked_urls, run_mode))
+    logger.info(f'Start crawling content of tracked urls...')
+    asyncio.get_event_loop().run_until_complete(start(run_mode))
     logger.info(f'Stop crawling content in mode {run_mode}. {time() - start_time} sec.')
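The `mode == N and await ...` lines dispatch on Python's short-circuiting `and`: the coroutine on the right is only created and awaited when the comparison holds. A minimal sketch of the same pattern (task names hypothetical):

    import asyncio

    async def crawl_everything(): print('mode 0 work')
    async def crawl_resources(): print('mode 6 work')

    async def start(mode: int):
        # `and` short-circuits: the coroutine after it is never even
        # created unless the mode matches.
        mode == 0 and await crawl_everything()
        mode == 6 and await crawl_resources()

    asyncio.get_event_loop().run_until_complete(start(6))  # prints: mode 6 work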

make_tracked_links_list.py
@@ -135,9 +135,9 @@ RELATIVE_JS_SCRIPTS_REGEX = r'["\'](.*\.js)["\'\?]'
 DOM_ATTRS = ['href', 'src']

 OUTPUT_FILENAME = os.environ.get('OUTPUT_FILENAME', 'tracked_links.txt')
-COMPARE_OUTPUT_WITH_FILENAME = os.environ.get('COMPARE_OUTPUT_WITH_FILENAME', OUTPUT_FILENAME)
+OUTPUT_RESOURCES_FILENAME = os.environ.get('OUTPUT_RESOURCES_FILENAME', 'tracked_res_links.txt')

-stel_dev_layer = 190
+STEL_DEV_LAYER = 190

 # unsecure but so simple
 CONNECTOR = aiohttp.TCPConnector(ssl=False, force_close=True, limit=300)
@@ -149,7 +149,7 @@ HEADERS = {
     'Accept-Encoding': 'gzip, deflate, br',
     'DNT': '1',
     'Connection': 'keep-alive',
-    'Cookie': f'stel_ln=en; stel_dev_layer={stel_dev_layer}',
+    'Cookie': f'stel_ln=en; stel_dev_layer={STEL_DEV_LAYER}',
     'Upgrade-Insecure-Requests': '1',
     'Sec-Fetch-Dest': 'document',
     'Sec-Fetch-Mode': 'navigate',
@@ -164,6 +164,7 @@ logger = logging.getLogger(__name__)

 VISITED_LINKS = set()
 LINKS_TO_TRACK = set()
+LINKS_TO_TRACKABLE_RESOURCES = set()


 def should_exclude(url: str) -> bool:
@@ -270,11 +271,28 @@ def cleanup_links(links: set[str]) -> set[str]:
     return cleaned_links


-def is_trackable_content_type(content_type) -> bool:
-    trackable_content_types = (
-        'css',
+def _is_x_content_type(content_types_set: set[str], content_type) -> bool:
+    for match_content_type in content_types_set:
+        if match_content_type in content_type:
+            return True
+
+    return False
+
+
+def is_textable_content_type(content_type: str) -> bool:
+    textable_content_type = {
+        'plain',
+        'css',
+        'json',
+        'text',
+        'javascript',
+    }
+
+    return _is_x_content_type(textable_content_type, content_type)
+
+
+def is_trackable_content_type(content_type) -> bool:
+    trackable_content_types = {
         'svg',
         'png',
         'jpeg',
@@ -284,13 +302,9 @@ def is_trackable_content_type(content_type) -> bool:
         'webm',
         'application/octet-stream',  # td updates
         'application/zip',
-    )
+    }

-    for trackable_content_type in trackable_content_types:
-        if trackable_content_type in content_type:
-            return True
-
-    return False
+    return _is_x_content_type(trackable_content_types, content_type)


 class ServerSideError(Exception):
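The refactor hoists the substring loop into a shared helper, so the "textable" and "trackable" checks now differ only in their match sets. The helper is equivalent to a one-line any(); a quick illustration (header values invented):

    def _is_x_content_type(content_types_set, content_type):
        # Equivalent to the loop above: substring match against any entry.
        return any(m in content_type for m in content_types_set)

    print(_is_x_content_type({'json', 'javascript'}, 'application/json; charset=utf-8'))  # True
    print(_is_x_content_type({'png', 'svg'}, 'text/html; charset=utf-8'))                 # False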
@@ -298,14 +312,14 @@ class ServerSideError(Exception):


 async def crawl(url: str, session: aiohttp.ClientSession):
-    ok = False
-    while not ok:
+    while True:
         try:
             await _crawl(url, session)
-            ok = True
         except (ServerSideError, ServerDisconnectedError, TimeoutError, ClientConnectorError):
             logger.warning(f'Client or timeout error. Retrying {url}')
             VISITED_LINKS.remove(url)
+        else:
+            break


 async def _crawl(url: str, session: aiohttp.ClientSession):
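Both files replace the `ok` flag with a bare `while True` plus `else: break`, relying on the fact that a try-statement's `else` branch runs only when no exception was raised. Stripped to its skeleton (names hypothetical), the retry pattern is:

    import asyncio

    RETRIABLE = (asyncio.TimeoutError, ConnectionError)

    async def fetch_with_retry(fetch):
        # `fetch` is any zero-argument coroutine function.
        while True:
            try:
                await fetch()
            except RETRIABLE:
                continue  # transient failure: log and retry in the real code
            else:
                break     # no exception raised: leave the retry loop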
@@ -318,7 +332,7 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
     async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
         content_type = response.headers.get('content-type')

-        if response.status // 100 == 5:
+        if 499 < response.status < 600:
             VISITED_LINKS.remove(url)
             logger.warning(f'Error 5XX. Retrying {url}')
             raise ServerSideError()
@@ -329,9 +343,11 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
             logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
             return

-        if 'text' in content_type or 'javascript' in content_type:
+        if is_textable_content_type(content_type):
             LINKS_TO_TRACK.add(url)

+            # raw content will be cached by aiohttp. Don't worry about it
+            raw_content = await response.read()
             content = await response.text(encoding='UTF-8')
             absolute_links = cleanup_links(find_absolute_links(content))
@@ -344,7 +360,7 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
             sub_links = absolute_links | relative_links
             await asyncio.gather(*[crawl(url, session) for url in sub_links])
         elif is_trackable_content_type(content_type):
-            LINKS_TO_TRACK.add(url)
+            LINKS_TO_TRACKABLE_RESOURCES.add(url)
         else:
             # for example, zip with update of macOS client
             logger.info(f'Unhandled type: {content_type} from {url}')
@@ -358,8 +374,10 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
             LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')
     except UnicodeDecodeError:
         logger.warning(f'Codec can\'t decode bytes. So it was a tgs file or response with broken content type {url}')
-    # except ClientConnectorError:
-    #     logger.warning(f'Wrong link: {url}')
+
+        if raw_content.startswith(b'GIF89a'):
+            LINKS_TO_TRACK.remove(url)
+            LINKS_TO_TRACKABLE_RESOURCES.add(url)


 async def start(url_list: set[str]):
|
|||
logger.info(f'Stop crawling links. {time() - start_time} sec.')
|
||||
|
||||
try:
|
||||
with open(COMPARE_OUTPUT_WITH_FILENAME, 'r') as f:
|
||||
OLD_URL_LIST = set([l.replace('\n', '') for l in f.readlines()])
|
||||
OLD_URL_LIST = set()
|
||||
for filename in (OUTPUT_FILENAME, OUTPUT_RESOURCES_FILENAME):
|
||||
with open(filename, 'r') as f:
|
||||
OLD_URL_LIST |= set([l.replace('\n', '') for l in f.readlines()])
|
||||
|
||||
logger.info(f'Is equal: {OLD_URL_LIST == LINKS_TO_TRACK}')
|
||||
logger.info(f'Deleted: {OLD_URL_LIST - LINKS_TO_TRACK}')
|
||||
logger.info(f'Added: {LINKS_TO_TRACK - OLD_URL_LIST}')
|
||||
CURRENT_URL_LIST = LINKS_TO_TRACK | LINKS_TO_TRACKABLE_RESOURCES
|
||||
|
||||
logger.info(f'Is equal: {OLD_URL_LIST == CURRENT_URL_LIST}')
|
||||
logger.info(f'Deleted: {OLD_URL_LIST - CURRENT_URL_LIST}')
|
||||
logger.info(f'Added: {CURRENT_URL_LIST - OLD_URL_LIST}')
|
||||
except IOError:
|
||||
pass
|
||||
|
||||
with open(OUTPUT_FILENAME, 'w') as f:
|
||||
f.write('\n'.join(sorted(LINKS_TO_TRACK)))
|
||||
|
||||
with open(OUTPUT_RESOURCES_FILENAME, 'w') as f:
|
||||
f.write('\n'.join(sorted(LINKS_TO_TRACKABLE_RESOURCES)))
|
||||
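Because trackable resources now live in their own file, the change report merges both committed lists into one old set before diffing it against the union of the two freshly built sets. In set terms (values invented):

    old = {'a', 'b', 'c'}   # union of tracked_links.txt and tracked_res_links.txt
    new = {'b', 'c', 'd'}   # LINKS_TO_TRACK | LINKS_TO_TRACKABLE_RESOURCES
    print(old == new)       # Is equal: False
    print(old - new)        # Deleted: {'a'}
    print(new - old)        # Added: {'d'}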

tracked_links.txt (1512 lines changed): file diff suppressed because it is too large
tracked_res_links.txt (1512 lines changed, new file): file diff suppressed because it is too large