# telegram-crawler/make_files_tree.py


import asyncio
import logging
import os
import re
import shutil
from asyncio.exceptions import TimeoutError
from string import punctuation, whitespace
from time import time

import aiofiles
import aiohttp
from aiohttp import ClientConnectorError
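
# The regexes below match page fragments that change on every fetch
# (generation timestamps, API hashes, passport ssids, nonces, proxy
# subnets); they are masked with a stable mock value so that diffs
# between crawler runs show only real content changes.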
PROTOCOL = 'https://'
ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace

DYNAMIC_PART_MOCK = 'telegram-crawler'

INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')

TRANSLATIONS_EN_CATEGORY_URL_REGEX = r'/en/[a-z_]+/[a-z_]+/$'

PAGE_GENERATION_TIME_REGEX = r'<!-- page generated in .+ -->'
PAGE_API_HASH_REGEX = r'\?hash=[a-z0-9]+'
PAGE_API_HASH_TEMPLATE = f'?hash={DYNAMIC_PART_MOCK}'
PASSPORT_SSID_REGEX = r'passport_ssid=[a-z0-9]+_[a-z0-9]+_[a-z0-9]+'
PASSPORT_SSID_TEMPLATE = f'passport_ssid={DYNAMIC_PART_MOCK}'
NONCE_REGEX = r'"nonce":"[a-z0-9]+_[a-z0-9]+_[a-z0-9]+'
NONCE_TEMPLATE = f'"nonce":"{DYNAMIC_PART_MOCK}'
PROXY_CONFIG_SUB_NET_REGEX = r'\d+\.\d+:8888;'
PROXY_CONFIG_SUB_NET_TEMPLATE = 'X.X:8888;'
# non-greedy '.*?' instead of the original '(.?)+', which is prone to
# catastrophic backtracking and could over-match to a later closing tag
TRANSLATE_SUGGESTION_REGEX = r'<div class="tr-value-suggestion">(.*?)</div>'

# insecure, but so simple
CONNECTOR = aiohttp.TCPConnector(ssl=False)
TIMEOUT = aiohttp.ClientTimeout(total=10)

logging.basicConfig(format='%(message)s', level=logging.DEBUG)
logger = logging.getLogger(__name__)
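

# Generic helper: download a URL into a local file; non-200 responses
# are silently skipped.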
async def download_file(url, path, session):
    async with session.get(url) as response:
        if response.status != 200:
            return

        async with aiofiles.open(path, mode='wb') as f:
            await f.write(await response.read())
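

# Fetch the latest Telegram beta release from AppCenter, decompile the APK
# with apktool, and snapshot selected resource files. Resource ids are
# masked ('id="tgcrawl"') since they change between builds and would
# pollute the diffs.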
async def download_apk_and_extract_resources(session: aiohttp.ClientSession):
    api_base = 'https://install.appcenter.ms/api/v0.1'
    parameterized_url = 'apps/drklo-2kb-ghpo/telegram-beta-2/distribution_groups/all-users-of-telegram-beta-2'
    base_url = f'{api_base}/{parameterized_url}'

    async def make_req(url):
        async with session.get(url) as response:
            if response.status != 200:
                return

            return await response.json(encoding='UTF-8')

    json = await make_req(f'{base_url}/public_releases')
    if json and json[0]:
        latest_id = json[0]['id']
    else:
        return

    json = await make_req(f'{base_url}/releases/{latest_id}')
    if json:
        download_url = json['download_url']
    else:
        return

    # note: apktool is a jar despite the 'tool.apk' filename
    await download_file('https://bitbucket.org/iBotPeaches/apktool/downloads/apktool_2.6.1.jar', 'tool.apk', session)
    await download_file(download_url, 'app.apk', session)

    def cleanup():
        if os.path.isdir('app'):
            shutil.rmtree('app')
        os.remove('tool.apk')
        os.remove('app.apk')

    # apktool d -s -f: decode resources into ./app, skip sources, force overwrite
    process = await asyncio.create_subprocess_exec('java', '-jar', 'tool.apk', 'd', '-s', '-f', 'app.apk')
    await process.communicate()
    if process.returncode != 0:
        cleanup()
        return

    files_to_track = [
        'res/values/strings.xml',
        'res/values/public.xml',
    ]
    for file in files_to_track:
        filename = os.path.join(OUTPUT_FOLDER, 'telegram-beta-android', file)
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        async with aiofiles.open(filename, 'w') as w_file:
            async with aiofiles.open(os.path.join('app', file), 'r') as r_file:
                content = await r_file.read()
                content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
                await w_file.write(content)

    cleanup()
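

# Telegram's translations platform paginates suggestion lists; this walks
# the XHR endpoint page by page (200 items per offset, judging by the step)
# and concatenates the returned HTML fragments.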
async def collect_translations_paginated_content(url: str, session: aiohttp.ClientSession) -> str:
    headers = {'X-Requested-With': 'XMLHttpRequest'}
    content = list()

    async def _get_page(offset: int):
        logger.info(f'Url: {url}, offset: {offset}')
        data = {'offset': offset, 'more': 1}

        try:
            async with session.post(
                    f'{PROTOCOL}{url}', data=data, headers=headers, allow_redirects=False, timeout=TIMEOUT
            ) as response:
                if response.status != 200:
                    logger.debug(f'Resending because of status code {response.status}')
                    return await _get_page(offset)

                json = await response.json(encoding='UTF-8')
                if 'more_html' in json and json['more_html']:
                    content.append(json['more_html'])
                    await _get_page(offset + 200)
        except (TimeoutError, ClientConnectorError):
            logger.warning(f'Client or timeout error. Retrying {url}; offset {offset}')
            await _get_page(offset)

    await _get_page(0)
    return '\n'.join(content)
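

# Download one tracked page, strip its dynamic fragments, and mirror it into
# OUTPUT_FOLDER. 5XX responses and connection/timeout errors are retried
# indefinitely; anything other than 200/304 is skipped.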
async def crawl(url: str, session: aiohttp.ClientSession):
    try:
        logger.info(f'Process {url}')
        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
            if response.status // 100 == 5:
                logger.warning(f'Error 5XX. Retrying {url}')
                return await crawl(url, session)

            if response.status not in {200, 304}:
                if response.status != 302:
                    content = await response.text()
                    logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
                return

            # drop empty parts and stray punctuation produced by leading/trailing slashes
            url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
            # handle bare domains and HTML pages without an extension in the URL
            ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
            filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext

            content = await response.text(encoding='UTF-8')
            if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
                content = await collect_translations_paginated_content(url, session)

            os.makedirs(os.path.dirname(filename), exist_ok=True)
            async with aiofiles.open(filename, 'w') as f:
                # mask dynamic fragments so diffs show only real changes
                content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
                content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
                content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
                content = re.sub(NONCE_REGEX, NONCE_TEMPLATE, content)
                content = re.sub(PROXY_CONFIG_SUB_NET_REGEX, PROXY_CONFIG_SUB_NET_TEMPLATE, content)
                content = re.sub(TRANSLATE_SUGGESTION_REGEX, '', content)

                logger.info(f'Write to {filename}')
                await f.write(content)
    except (TimeoutError, ClientConnectorError):
        logger.warning(f'Client or timeout error. Retrying {url}')
        await crawl(url, session)
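

# Entry point: crawl every tracked URL concurrently over one shared session,
# then refresh the decompiled beta APK resources.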
async def start(url_list: set[str]):
    async with aiohttp.ClientSession(connector=CONNECTOR) as session:
        await asyncio.gather(*[crawl(url, session) for url in url_list])
        # yep, the APK is re-downloaded on every run, and so what? ;d
        await download_apk_and_extract_resources(session)


if __name__ == '__main__':
    with open(INPUT_FILENAME, 'r') as f:
        tracked_urls = {line.rstrip('\n') for line in f}

    logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
    start_time = time()
    asyncio.get_event_loop().run_until_complete(start(tracked_urls))
    logger.info(f'Stop crawling content. {time() - start_time} sec.')
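
# Usage sketch (assumed invocation, not part of the original file):
#   put one URL per line, without the scheme, into tracked_links.txt, e.g.
#     core.telegram.org/bots/api
#   then run:
#     INPUT_FILENAME=tracked_links.txt OUTPUT_FOLDER=data/ python make_files_tree.py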