Mirror of https://github.com/MarshalX/telegram-crawler.git (synced 2025-01-11 12:41:37 +01:00)
reorganise structure of data folder; split the "make files tree" workflow into 2 parts (to be split into 4 later)
commit 9cab070822 (parent b9fc0ea79f)
5 changed files with 1636 additions and 1564 deletions
.github/workflows/make_files_tree.yml (vendored, 28 lines changed)
@@ -12,8 +12,17 @@ jobs:
   fetch_new_content:
     name: Make files tree
     runs-on: macos-10.15
     continue-on-error: true
     timeout-minutes: 10
+
+    strategy:
+      fail-fast: false
+      matrix:
+        mode: [
+          "0",
+          "6"
+        ]
+
     steps:

       - name: Clone.
@@ -37,18 +46,35 @@ jobs:
           TELEGRAM_SESSION_TEST: ${{ secrets.TELEGRAM_SESSION_TEST }}
           TELEGRAM_API_ID: ${{ secrets.TELEGRAM_API_ID }}
           TELEGRAM_API_HASH: ${{ secrets.TELEGRAM_API_HASH }}
+          MODE: ${{ matrix.mode }}
         run: |
           git pull
           python make_files_tree.py
           rm -rf __pycache__

-      - name: Commit and push changes.
+      - name: Prepare data for mode 0.
+        if: matrix.mode == '0'
+        run: |
+          git checkout data
+
+          mv data/web_res data_ci/web_res
+          rm -rf data
+          mv data_ci data
+
+      - name: Prepare data for mode 6.
+        if: matrix.mode == '6'
+        run: |
+          git checkout data
+
+          mv data/web data_ci/web
+          mv data/client data_ci/client
+          mv data/server data_ci/server
+
+          rm -rf data
+          mv data_ci data
+
+      - name: Commit and push changes.
         run: |
           git config --global user.email "github-action@users.noreply.github.com"
           git config --global user.name "GitHub Action"
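Both "Prepare data" steps do the same swap: restore the committed data/ tree from git, graft the subtrees this mode did not refresh onto the freshly crawled tree, then promote the merged tree. A minimal Python sketch of that swap, assuming (the diff does not show it) that the crawler's fresh output lands in data_ci/ via an OUTPUT_FOLDER override:

    import shutil
    import subprocess

    def prepare_data(mode: str) -> None:
        # Restore the committed snapshot of data/ before merging.
        subprocess.run(['git', 'checkout', 'data'], check=True)
        # Mode 0 refreshes web/client/server, so the old web_res is kept;
        # mode 6 refreshes web_res, so the old web/client/server are kept.
        kept = ['web_res'] if mode == '0' else ['web', 'client', 'server']
        for subtree in kept:
            shutil.move(f'data/{subtree}', f'data_ci/{subtree}')
        shutil.rmtree('data')           # drop the rest of the old tree
        shutil.move('data_ci', 'data')  # promote the merged tree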

make_files_tree.py
@@ -1,4 +1,5 @@
 import asyncio
+import hashlib
 import json
 import logging
 import os
@@ -7,7 +8,6 @@ import re
 import shutil
 import sys
 import zipfile
-import hashlib
 from asyncio.exceptions import TimeoutError
 from string import punctuation, whitespace
 from time import time
@@ -25,7 +25,12 @@ ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace
 DYNAMIC_PART_MOCK = 'telegram-crawler'

 INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
+INPUT_RES_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_res_links.txt')
 OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')
+OUTPUT_MTPROTO_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_MTPROTO_FOLDER', 'server/'))
+OUTPUT_SITES_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_SITES_FOLDER', 'web/'))
+OUTPUT_CLIENTS_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_CLIENTS_FOLDER', 'client/'))
+OUTPUT_RESOURCES_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_RESOURCES_FOLDER', 'web_res/'))

 TRANSLATIONS_EN_CATEGORY_URL_REGEX = r'/en/[a-z_]+/[a-z_]+/$'
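One apparent slip in the added line: INPUT_RES_FILENAME reads the INPUT_FILENAME environment key, so overriding that variable would point both lists at the same file. Judging by the neighbouring constants, the intended line was presumably:

    import os

    INPUT_RES_FILENAME = os.environ.get('INPUT_RES_FILENAME', 'tracked_res_links.txt')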
@@ -44,7 +49,7 @@ SPARKLE_SE_REGEX = r';se=(.*?);'
 SPARKLE_SIG_TEMPLATE = f';sig={DYNAMIC_PART_MOCK};'
 SPARKLE_SE_TEMPLATE = f';se={DYNAMIC_PART_MOCK};'

-stel_dev_layer = 190
+STEL_DEV_LAYER = 190

 # unsecure but so simple
 CONNECTOR = aiohttp.TCPConnector(ssl=False, force_close=True, limit=300)
@@ -56,7 +61,7 @@ HEADERS = {
     'Accept-Encoding': 'gzip, deflate, br',
     'DNT': '1',
     'Connection': 'keep-alive',
-    'Cookie': f'stel_ln=en; stel_dev_layer={stel_dev_layer}',
+    'Cookie': f'stel_ln=en; stel_dev_layer={STEL_DEV_LAYER}',
     'Upgrade-Insecure-Requests': '1',
     'Sec-Fetch-Dest': 'document',
     'Sec-Fetch-Mode': 'navigate',
@@ -127,7 +132,7 @@ async def track_additional_files(
         content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
         content = re.sub(r'name="APKTOOL_DUMMY_.*" id', 'name="tgcrawl" id', content)

-        filename = os.path.join(OUTPUT_FOLDER, output_dir_name, file)
+        filename = os.path.join(output_dir_name, file)
         os.makedirs(os.path.dirname(filename), exist_ok=True)
         async with aiofiles.open(filename, 'w', encoding='utf-8') as w_file:
             await w_file.write(content)
@@ -140,7 +145,7 @@ async def download_telegram_macos_beta_and_extract_resources(session: aiohttp.ClientSession):
     if not download_url:
         return

-    crawled_data_folder = 'telegram-beta-macos'
+    crawled_data_folder = os.path.join(OUTPUT_CLIENTS_FOLDER, 'macos-beta')
     client_folder_name = 'macos'
     client_archive_name = 'macos.zip'
@@ -219,7 +224,7 @@ async def download_telegram_ios_beta_and_extract_resources(session: aiohttp.ClientSession):
     assets_filename = 'Assets.car'
     assets_output_dir = 'ios_assets'
     client_folder_name = 'ios'
-    crawled_data_folder = 'telegram-beta-ios'
+    crawled_data_folder = os.path.join(OUTPUT_CLIENTS_FOLDER, 'ios-beta')

     if 'darwin' not in platform.system().lower():
         await download_file(download_url, ipa_filename, session)
@@ -304,6 +309,8 @@ async def download_telegram_android_beta_and_extract_resources(session: aiohttp.ClientSession):
     parameterized_url = 'apps/drklo-2kb-ghpo/telegram-beta-2/distribution_groups/all-users-of-telegram-beta-2'
     download_url = await get_download_link_of_latest_appcenter_release(parameterized_url, session)

+    crawled_data_folder = os.path.join(OUTPUT_CLIENTS_FOLDER, 'android-beta')
+
     if not download_url:
         return
@@ -328,7 +335,7 @@ async def download_telegram_android_beta_and_extract_resources(session: aiohttp.ClientSession):
         'res/values/strings.xml',
         'res/values/public.xml'
     ]
-    await track_additional_files(files_to_track, 'android', 'telegram-beta-android')
+    await track_additional_files(files_to_track, 'android', crawled_data_folder)

     cleanup()
@@ -428,9 +435,8 @@ async def _fetch_and_track_mtproto(app, output_dir):
     configs['GetConfig'].expires = 0
     configs['GetConfig'].dc_options = []

-    output_dir_name = os.path.join('telegram-mtproto', output_dir)
     for file, content in configs.items():
-        filename = os.path.join(OUTPUT_FOLDER, output_dir_name, f'{file}.json')
+        filename = os.path.join(OUTPUT_MTPROTO_FOLDER, output_dir, f'{file}.json')
         os.makedirs(os.path.dirname(filename), exist_ok=True)
         async with aiofiles.open(filename, 'w', encoding='utf-8') as w_file:
             await w_file.write(str(content))
@@ -460,20 +466,20 @@ class RetryError(Exception):
     ...


-async def crawl(url: str, session: aiohttp.ClientSession):
-    ok = False
-    while not ok:
+async def crawl(url: str, session: aiohttp.ClientSession, output_dir: str = OUTPUT_SITES_FOLDER):
+    while True:
         try:
-            await _crawl(url, session)
-            ok = True
+            await _crawl(url, session, output_dir)
         except (RetryError, ServerDisconnectedError, TimeoutError, ClientConnectorError):
             logger.warning(f'Client or timeout error. Retrying {url}')
+        else:
+            break


-async def _crawl(url: str, session: aiohttp.ClientSession):
+async def _crawl(url: str, session: aiohttp.ClientSession, output_dir: str):
     logger.info(f'Process {url}')
     async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT, headers=HEADERS) as response:
-        if response.status // 100 == 5:
+        if 499 < response.status < 600:
             msg = f'Error 5XX. Retrying {url}'
             logger.warning(msg)
             raise RetryError(msg)
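The 5xx check changes from `response.status // 100 == 5` to `499 < response.status < 600`; for integer HTTP status codes the two forms select exactly the same range, so this is a readability change rather than a behaviour change. A quick check:

    for status in (499, 500, 555, 599, 600):
        assert (status // 100 == 5) == (499 < status < 600)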
@@ -502,7 +508,7 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
     if is_hashable_only or is_sucking_file:
         ext = ''

-    filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
+    filename = os.path.join(output_dir, *url_parts) + ext
     os.makedirs(os.path.dirname(filename), exist_ok=True)

     if is_sucking_file or is_hashable_only:
@@ -529,29 +535,44 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
         await f.write(content)


-async def start(url_list: set[str], mode: int):
+async def crawl_web(session: aiohttp.ClientSession):
+    with open(INPUT_FILENAME, 'r') as f:
+        tracked_urls = set([l.replace('\n', '') for l in f.readlines()])
+
+    await asyncio.gather(*[crawl(url, session) for url in tracked_urls])
+
+
+async def crawl_web_res(session: aiohttp.ClientSession):
+    with open(INPUT_RES_FILENAME, 'r') as f:
+        tracked_urls = set([l.replace('\n', '') for l in f.readlines()])
+
+    await asyncio.gather(*[crawl(url, session, OUTPUT_RESOURCES_FOLDER) for url in tracked_urls])
+
+
+async def start(mode: int):
     async with aiohttp.ClientSession(connector=CONNECTOR) as session:
         # all without web resources
         mode == 0 and await asyncio.gather(
-            *[crawl(url, session) for url in url_list],
+            crawl_web(session),
             download_telegram_android_beta_and_extract_resources(session),
             download_telegram_macos_beta_and_extract_resources(session),
-            # track_mtproto_configs(),
+            track_mtproto_configs(),
             download_telegram_ios_beta_and_extract_resources(session),
         )
-        mode == 1 and await asyncio.gather(*[crawl(url, session) for url in url_list])
+        mode == 1 and await crawl_web(session)
         mode == 2 and await download_telegram_android_beta_and_extract_resources(session)
         mode == 3 and await download_telegram_macos_beta_and_extract_resources(session)
         mode == 4 and await track_mtproto_configs()
         mode == 5 and await download_telegram_ios_beta_and_extract_resources(session)
+        mode == 6 and await crawl_web_res(session)


 if __name__ == '__main__':
     run_mode = int(sys.argv[1]) if len(sys.argv) > 1 else 0

-    with open(INPUT_FILENAME, 'r') as f:
-        tracked_urls = set([l.replace('\n', '') for l in f.readlines()])
+    if 'MODE' in os.environ:
+        run_mode = int(os.environ['MODE'])

     start_time = time()
-    logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
-    asyncio.get_event_loop().run_until_complete(start(tracked_urls, run_mode))
+    logger.info(f'Start crawling content of tracked urls...')
+    asyncio.get_event_loop().run_until_complete(start(run_mode))
     logger.info(f'Stop crawling content in mode {run_mode}. {time() - start_time} sec.')
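The `mode == N and await ...` lines dispatch on Python's short-circuiting `and`: the coroutine on the right is only created and awaited when the comparison holds. A minimal sketch of the same pattern (task names hypothetical):

    import asyncio

    async def crawl_everything(): print('mode 0 work')
    async def crawl_resources(): print('mode 6 work')

    async def start(mode: int):
        # `and` short-circuits: the coroutine after it is never even
        # created unless the mode matches.
        mode == 0 and await crawl_everything()
        mode == 6 and await crawl_resources()

    asyncio.get_event_loop().run_until_complete(start(6))  # prints: mode 6 work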

make_tracked_links_list.py
@@ -135,9 +135,9 @@ RELATIVE_JS_SCRIPTS_REGEX = r'["\'](.*\.js)["\'\?]'
 DOM_ATTRS = ['href', 'src']

 OUTPUT_FILENAME = os.environ.get('OUTPUT_FILENAME', 'tracked_links.txt')
-COMPARE_OUTPUT_WITH_FILENAME = os.environ.get('COMPARE_OUTPUT_WITH_FILENAME', OUTPUT_FILENAME)
+OUTPUT_RESOURCES_FILENAME = os.environ.get('OUTPUT_RESOURCES_FILENAME', 'tracked_res_links.txt')

-stel_dev_layer = 190
+STEL_DEV_LAYER = 190

 # unsecure but so simple
 CONNECTOR = aiohttp.TCPConnector(ssl=False, force_close=True, limit=300)
@@ -149,7 +149,7 @@ HEADERS = {
     'Accept-Encoding': 'gzip, deflate, br',
     'DNT': '1',
     'Connection': 'keep-alive',
-    'Cookie': f'stel_ln=en; stel_dev_layer={stel_dev_layer}',
+    'Cookie': f'stel_ln=en; stel_dev_layer={STEL_DEV_LAYER}',
     'Upgrade-Insecure-Requests': '1',
     'Sec-Fetch-Dest': 'document',
     'Sec-Fetch-Mode': 'navigate',
@@ -164,6 +164,7 @@ logger = logging.getLogger(__name__)

 VISITED_LINKS = set()
 LINKS_TO_TRACK = set()
+LINKS_TO_TRACKABLE_RESOURCES = set()


 def should_exclude(url: str) -> bool:
@@ -270,11 +271,28 @@ def cleanup_links(links: set[str]) -> set[str]:
     return cleaned_links


-def is_trackable_content_type(content_type) -> bool:
-    trackable_content_types = (
-        'css',
+def _is_x_content_type(content_types_set: set[str], content_type) -> bool:
+    for match_content_type in content_types_set:
+        if match_content_type in content_type:
+            return True
+
+    return False
+
+
+def is_textable_content_type(content_type: str) -> bool:
+    textable_content_type = {
+        'plain',
+        'css',
+        'json',
+        'text',
+        'javascript',
+    }
+
+    return _is_x_content_type(textable_content_type, content_type)
+
+
+def is_trackable_content_type(content_type) -> bool:
+    trackable_content_types = {
         'svg',
         'png',
         'jpeg',
@@ -284,13 +302,9 @@ def is_trackable_content_type(content_type) -> bool:
         'webm',
         'application/octet-stream',  # td updates
         'application/zip',
-    )
+    }

-    for trackable_content_type in trackable_content_types:
-        if trackable_content_type in content_type:
-            return True
-
-    return False
+    return _is_x_content_type(trackable_content_types, content_type)


 class ServerSideError(Exception):
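The refactor hoists the substring loop into a shared helper, so the "textable" and "trackable" checks now differ only in their match sets. The helper is equivalent to a one-line any(); a quick illustration (header values invented):

    def _is_x_content_type(content_types_set, content_type):
        # Equivalent to the loop above: substring match against any entry.
        return any(m in content_type for m in content_types_set)

    print(_is_x_content_type({'json', 'javascript'}, 'application/json; charset=utf-8'))  # True
    print(_is_x_content_type({'png', 'svg'}, 'text/html; charset=utf-8'))                 # False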
@@ -298,14 +312,14 @@ class ServerSideError(Exception):


 async def crawl(url: str, session: aiohttp.ClientSession):
-    ok = False
-    while not ok:
+    while True:
         try:
             await _crawl(url, session)
-            ok = True
         except (ServerSideError, ServerDisconnectedError, TimeoutError, ClientConnectorError):
             logger.warning(f'Client or timeout error. Retrying {url}')
             VISITED_LINKS.remove(url)
+        else:
+            break


 async def _crawl(url: str, session: aiohttp.ClientSession):
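Both files replace the `ok` flag with a bare `while True` plus `else: break`, relying on the fact that a try-statement's `else` branch runs only when no exception was raised. Stripped to its skeleton (names hypothetical), the retry pattern is:

    import asyncio

    RETRIABLE = (asyncio.TimeoutError, ConnectionError)

    async def fetch_with_retry(fetch):
        # `fetch` is any zero-argument coroutine function.
        while True:
            try:
                await fetch()
            except RETRIABLE:
                continue  # transient failure: log and retry in the real code
            else:
                break     # no exception raised: leave the retry loop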
@@ -318,7 +332,7 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
     async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
         content_type = response.headers.get('content-type')

-        if response.status // 100 == 5:
+        if 499 < response.status < 600:
             VISITED_LINKS.remove(url)
             logger.warning(f'Error 5XX. Retrying {url}')
             raise ServerSideError()
@@ -329,9 +343,11 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
             logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
             return

-        if 'text' in content_type or 'javascript' in content_type:
+        if is_textable_content_type(content_type):
             LINKS_TO_TRACK.add(url)

+            # raw content will be cached by aiohttp. Don't worry about it
+            raw_content = await response.read()
             content = await response.text(encoding='UTF-8')
             absolute_links = cleanup_links(find_absolute_links(content))
@@ -344,7 +360,7 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
             sub_links = absolute_links | relative_links
             await asyncio.gather(*[crawl(url, session) for url in sub_links])
         elif is_trackable_content_type(content_type):
-            LINKS_TO_TRACK.add(url)
+            LINKS_TO_TRACKABLE_RESOURCES.add(url)
         else:
             # for example, zip with update of macOS client
             logger.info(f'Unhandled type: {content_type} from {url}')
@@ -358,8 +374,10 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
             LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')
     except UnicodeDecodeError:
         logger.warning(f'Codec can\'t decode bytes. So it was a tgs file or response with broken content type {url}')
-    # except ClientConnectorError:
-    #     logger.warning(f'Wrong link: {url}')
+
+        if raw_content.startswith(b'GIF89a'):
+            LINKS_TO_TRACK.remove(url)
+            LINKS_TO_TRACKABLE_RESOURCES.add(url)


 async def start(url_list: set[str]):
|
|||
logger.info(f'Stop crawling links. {time() - start_time} sec.')
|
||||
|
||||
try:
|
||||
with open(COMPARE_OUTPUT_WITH_FILENAME, 'r') as f:
|
||||
OLD_URL_LIST = set([l.replace('\n', '') for l in f.readlines()])
|
||||
OLD_URL_LIST = set()
|
||||
for filename in (OUTPUT_FILENAME, OUTPUT_RESOURCES_FILENAME):
|
||||
with open(filename, 'r') as f:
|
||||
OLD_URL_LIST |= set([l.replace('\n', '') for l in f.readlines()])
|
||||
|
||||
logger.info(f'Is equal: {OLD_URL_LIST == LINKS_TO_TRACK}')
|
||||
logger.info(f'Deleted: {OLD_URL_LIST - LINKS_TO_TRACK}')
|
||||
logger.info(f'Added: {LINKS_TO_TRACK - OLD_URL_LIST}')
|
||||
CURRENT_URL_LIST = LINKS_TO_TRACK | LINKS_TO_TRACKABLE_RESOURCES
|
||||
|
||||
logger.info(f'Is equal: {OLD_URL_LIST == CURRENT_URL_LIST}')
|
||||
logger.info(f'Deleted: {OLD_URL_LIST - CURRENT_URL_LIST}')
|
||||
logger.info(f'Added: {CURRENT_URL_LIST - OLD_URL_LIST}')
|
||||
except IOError:
|
||||
pass
|
||||
|
||||
with open(OUTPUT_FILENAME, 'w') as f:
|
||||
f.write('\n'.join(sorted(LINKS_TO_TRACK)))
|
||||
|
||||
with open(OUTPUT_RESOURCES_FILENAME, 'w') as f:
|
||||
f.write('\n'.join(sorted(LINKS_TO_TRACKABLE_RESOURCES)))
|
||||
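Because trackable resources now live in their own file, the change report merges both committed lists into one old set before diffing it against the union of the two freshly built sets. In set terms (values invented):

    old = {'a', 'b', 'c'}   # union of tracked_links.txt and tracked_res_links.txt
    new = {'b', 'c', 'd'}   # LINKS_TO_TRACK | LINKS_TO_TRACKABLE_RESOURCES
    print(old == new)       # Is equal: False
    print(old - new)        # Deleted: {'a'}
    print(new - old)        # Added: {'d'}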

tracked_links.txt (1512 lines changed): file diff suppressed because it is too large
tracked_res_links.txt (1512 lines changed, new file): file diff suppressed because it is too large