welcome to web 13.37; add tracking of wallet mini app

Ilya (Marshal) 2023-09-15 11:57:25 +02:00
parent 8d1f2ee194
commit 3726d67b54
No known key found for this signature in database
GPG key ID: F8FB1A006AD3156D
5 changed files with 447 additions and 1 deletion


@@ -35,6 +35,9 @@ jobs:
- mode: client
os: macos-12
- mode: mini_app
os: ubuntu-20.04
steps:
- name: Clone.
@@ -74,6 +77,7 @@ jobs:
mv data/web_tr data_ci/web_tr
mv data/client data_ci/client
mv data/server data_ci/server
mv data/mini_app data_ci/mini_app
rm -rf data
mv data_ci data
@@ -88,6 +92,7 @@ jobs:
mv data/web_tr data_ci/web_tr
mv data/client data_ci/client
mv data/server data_ci/server
mv data/mini_app data_ci/mini_app
rm -rf data
mv data_ci data
@@ -102,6 +107,7 @@ jobs:
mv data/web_res data_ci/web_res
mv data/server data_ci/server
mv data/client data_ci/client
mv data/mini_app data_ci/mini_app
rm -rf data
mv data_ci data
@@ -116,6 +122,7 @@ jobs:
mv data/web_res data_ci/web_res
mv data/web_tr data_ci/web_tr
mv data/client data_ci/client
mv data/mini_app data_ci/mini_app
rm -rf data
mv data_ci data
@@ -130,6 +137,22 @@ jobs:
mv data/web_res data_ci/web_res
mv data/web_tr data_ci/web_tr
mv data/server data_ci/server
mv data/mini_app data_ci/mini_app
rm -rf data
mv data_ci data
- name: Prepare data.
if: matrix.mode == 'mini_app'
run: |
git checkout data
git pull
mv data/web data_ci/web
mv data/web_res data_ci/web_res
mv data/web_tr data_ci/web_tr
mv data/server data_ci/server
mv data/client data_ci/client
rm -rf data
mv data_ci data
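
Each per-mode "Prepare data." step above, including the new mini_app one, follows the same pattern: the job pulls the latest data branch, moves every subfolder it does not own from the old checkout into the freshly crawled data_ci, then swaps data_ci in as the new data, so one mode never clobbers data crawled by the others. A minimal sketch of that pattern in Python (a hypothetical helper for illustration only; the workflow does this in shell):

    import os
    import shutil

    # assumed mode list, mirroring the folders moved in the steps above
    ALL_TARGETS = ['web', 'web_res', 'web_tr', 'server', 'client', 'mini_app']

    def preserve_other_targets(mode: str) -> None:
        # keep everything this mode did not crawl so it is not overwritten
        for target in ALL_TARGETS:
            if target != mode:
                shutil.move(os.path.join('data', target), os.path.join('data_ci', target))
        shutil.rmtree('data')         # drop the stale checkout
        os.rename('data_ci', 'data')  # promote the fresh crawl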


@@ -44,7 +44,7 @@ STATUS_TO_EMOJI = {
}
AVAILABLE_HASHTAGS = {
'web_tr', 'web_res', 'web', 'server', 'test_server', 'client', 'ios', 'macos', 'android'
'web_tr', 'web_res', 'web', 'server', 'test_server', 'client', 'ios', 'macos', 'android', 'mini_app', 'wallet'
}
HASHTAGS_PATTERNS = {
# a regex would be more flexible here; for example, it would fix the double-hashtag issue ('#web #web_res' both firing) when data/res has not changed
@@ -57,6 +57,8 @@ HASHTAGS_PATTERNS = {
'ios': os.path.join(ROOT_TREE_DIR, 'client', 'ios-beta'),
'macos': os.path.join(ROOT_TREE_DIR, 'client', 'macos-beta'),
'android': os.path.join(ROOT_TREE_DIR, 'client', 'android-beta'),
'mini_app': os.path.join(ROOT_TREE_DIR, 'mini_app'),
'wallet': os.path.join(ROOT_TREE_DIR, 'mini_app', 'wallet'),
}
# order is important!
PATHS_TO_REMOVE_FROM_ALERT = [
@@ -65,6 +67,7 @@ PATHS_TO_REMOVE_FROM_ALERT = [
os.path.join(ROOT_TREE_DIR, 'web'),
os.path.join(ROOT_TREE_DIR, 'server'),
os.path.join(ROOT_TREE_DIR, 'client'),
os.path.join(ROOT_TREE_DIR, 'mini_app'),
]
FORUM_CHAT_ID = '@tfcrawl'
@@ -76,6 +79,7 @@ HASHTAG_TO_TOPIC = {
'ios': '2194',
'macos': '2187',
'android': '2190',
'wallet': '5685',
}
GITHUB_API_LIMIT_PER_HOUR = 5_000
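
For illustration, the prefix patterns above can be matched against a changed path roughly like this (a hedged sketch; the bot's actual matching code is not part of this diff):

    import os

    # hypothetical changed file inside the tracked tree
    changed = os.path.join(ROOT_TREE_DIR, 'mini_app', 'wallet', 'src', 'index.tsx')
    # every hashtag whose pattern is a string prefix of the changed path applies
    tags = {tag for tag, prefix in HASHTAGS_PATTERNS.items() if changed.startswith(prefix)}
    # -> {'mini_app', 'wallet'}; HASHTAG_TO_TOPIC then routes 'wallet' to topic '5685'.
    # Naive prefix matching is also why '#web' can fire alongside '#web_res',
    # the double-hashtag caveat noted in the comment above.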


@@ -33,6 +33,7 @@ OUTPUT_SITES_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_SITES_F
OUTPUT_CLIENTS_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_CLIENTS_FOLDER', 'client/'))
OUTPUT_RESOURCES_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_RESOURCES_FOLDER', 'web_res/'))
OUTPUT_TRANSLATIONS_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_RESOURCES_FOLDER', 'web_tr/'))
OUTPUT_MINI_APPS_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_MINI_APPS_FOLDER', 'mini_app/'))
TRANSLATIONS_EN_CATEGORY_URL_REGEX = r'/en/[a-z_]+/[a-z_]+/$'
@@ -351,6 +352,38 @@ async def download_telegram_android_beta_and_extract_resources(session: aiohttp.
cleanup()
async def crawl_mini_app_wallet():
    crawled_data_folder = os.path.join(OUTPUT_MINI_APPS_FOLDER, 'wallet')

    def cleanup():
        # drop the temporary extraction folder if it exists
        os.path.isdir('wallet') and shutil.rmtree('wallet')

    # unpack the Wallet mini app's webpack sourcemaps into ./wallet
    process = await asyncio.create_subprocess_exec(
        'python', 'unwebpack_sourcemap.py', '--make-directory', '--detect', 'https://walletbot.me/', 'wallet',
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.STDOUT
    )
    await process.communicate()
    if process.returncode != 0:
        cleanup()
        return

    # collect the unpacked src/ files as paths relative to the unpacked root
    files_to_track = []
    crawled_unpacked_folder = os.path.join('wallet', 'webpack', 'wallet-react-form', 'empty_0')
    crawled_src_folder = os.path.join(crawled_unpacked_folder, 'src')
    for root, folders, files in os.walk(crawled_src_folder):
        for file in files:
            files_to_track.append(os.path.join(root, file).replace(f'{crawled_unpacked_folder}/', ''))

    await track_additional_files(
        files_to_track, crawled_unpacked_folder, crawled_data_folder
    )

    cleanup()
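# For reference (illustrative values): the prefix strip above turns an unpacked
# path like 'wallet/webpack/wallet-react-form/empty_0/src/index.tsx' into the
# repo-relative 'src/index.tsx'; os.path.relpath(path, crawled_unpacked_folder)
# would be a separator-safe equivalent.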
async def collect_translations_paginated_content(url: str, session: aiohttp.ClientSession) -> str:
import cssutils
from bs4 import BeautifulSoup
@@ -713,6 +746,7 @@ async def start(mode: str):
download_telegram_android_beta_and_extract_resources(session),
download_telegram_macos_beta_and_extract_resources(session),
download_telegram_ios_beta_and_extract_resources(session),
crawl_mini_app_wallet(),
)
mode == 'web' and await asyncio.gather(
crawl_web(session),
@@ -731,6 +765,9 @@ async def start(mode: str):
download_telegram_macos_beta_and_extract_resources(session),
download_telegram_ios_beta_and_extract_resources(session),
)
mode == 'mini_app' and await asyncio.gather(
crawl_mini_app_wallet(),
)
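# Note: the 'mode == ... and await asyncio.gather(...)' lines above rely on
# short-circuit 'and' as a one-line guard: the gather is awaited only when the
# mode matches, mirroring the matrix modes in the CI workflow.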
if __name__ == '__main__':


@@ -5,4 +5,5 @@ git+https://github.com/MarshalX/pyrogram
TgCrypto==1.2.3
beautifulsoup4==4.11.1
cssutils==2.4.2
requests==2.31.0
# uvloop==0.16.0

unwebpack_sourcemap.py (new file, 381 additions)

@@ -0,0 +1,381 @@
#!/usr/bin/env python3
"""
unwebpack_sourcemap.py
by rarecoil (github.com/rarecoil/unwebpack-sourcemap)
Reads Webpack source maps and extracts the disclosed
uncompiled/commented source code for review. Can detect and
attempt to read sourcemaps from Webpack bundles with the `-d`
flag. Puts source into a directory structure similar to dev.
MIT License
Copyright (c) 2019 rarecoil.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import argparse
import json
import os
import re
import string
import sys
from urllib.parse import urlparse
from unicodedata import normalize
import requests
from bs4 import BeautifulSoup, SoupStrainer
class SourceMapExtractor(object):
"""Primary SourceMapExtractor class. Feed this arguments."""
_target = None
_is_local = False
_attempt_sourcemap_detection = False
_output_directory = ""
_target_extracted_sourcemaps = []
_path_sanitiser = None
def __init__(self, options):
"""Initialize the class."""
if 'output_directory' not in options:
raise SourceMapExtractorError("output_directory must be set in options.")
else:
self._output_directory = os.path.abspath(options['output_directory'])
if not os.path.isdir(self._output_directory):
if options['make_directory'] is True:
os.mkdir(self._output_directory)
else:
raise SourceMapExtractorError(
"output_directory does not exist. Pass --make-directory to auto-make it.")
self._path_sanitiser = PathSanitiser(self._output_directory)
if options['disable_ssl_verification'] == True:
self.disable_verify_ssl = True
else:
self.disable_verify_ssl = False
if options['local'] == True:
self._is_local = True
if options['detect'] == True:
self._attempt_sourcemap_detection = True
self._validate_target(options['uri_or_file'])
def run(self):
"""Run extraction process."""
if self._is_local == False:
if self._attempt_sourcemap_detection:
detected_sourcemaps = self._detect_js_sourcemaps(self._target)
for sourcemap in detected_sourcemaps:
self._parse_remote_sourcemap(sourcemap)
else:
self._parse_remote_sourcemap(self._target)
else:
self._parse_sourcemap(self._target)
def _validate_target(self, target):
"""Do some basic validation on the target."""
parsed = urlparse(target)
if self._is_local is True:
self._target = os.path.abspath(target)
if not os.path.isfile(self._target):
raise SourceMapExtractorError(
"uri_or_file is set to be a file, but doesn't seem to exist. check your path.")
else:
if parsed.scheme == "":
raise SourceMapExtractorError("uri_or_file isn't a URI, and --local was not set. set --local?")
file, ext = os.path.splitext(parsed.path)
self._target = target
if ext != '.map' and self._attempt_sourcemap_detection is False:
print("WARNING: URI does not have .map extension, and --detect is not flagged.")
def _parse_remote_sourcemap(self, uri):
"""GET a remote sourcemap and parse it."""
data, final_uri = self._get_remote_data(uri)
if data is not None:
self._parse_sourcemap(data, True)
else:
print("WARNING: Could not retrieve sourcemap from URI %s" % final_uri)
def _detect_js_sourcemaps(self, uri):
"""Pull HTML and attempt to find JS files, then read the JS files and look for sourceMappingURL."""
remote_sourcemaps = []
data, final_uri = self._get_remote_data(uri)
# TODO: scan to see if this is a sourcemap instead of assuming HTML
print("Detecting sourcemaps in HTML at %s" % final_uri)
script_strainer = SoupStrainer("script", src=True)
try:
soup = BeautifulSoup(data, "html.parser", parse_only=script_strainer)
except Exception:
raise SourceMapExtractorError("Could not parse HTML at URI %s" % final_uri)
for script in soup:
source = script['src']
parsed_uri = urlparse(source)
next_target_uri = ""
if parsed_uri.scheme != '':
next_target_uri = source
else:
current_uri = urlparse(final_uri)
built_uri = current_uri.scheme + "://" + current_uri.netloc + source
next_target_uri = built_uri
js_data, last_target_uri = self._get_remote_data(next_target_uri)
# get last line of file
last_line = js_data.rstrip().split("\n")[-1]
regex = r"//#\s*sourceMappingURL=(.*)$"
matches = re.search(regex, last_line)
if matches:
asset = matches.groups(0)[0].strip()
asset_target = urlparse(asset)
if asset_target.scheme != '':
print("Detected sourcemap at remote location %s" % asset)
remote_sourcemaps.append(asset)
else:
current_uri = urlparse(last_target_uri)
asset_uri = current_uri.scheme + '://' + \
current_uri.netloc + \
os.path.dirname(current_uri.path) + \
'/' + asset
print("Detected sourcemap at remote location %s" % asset_uri)
remote_sourcemaps.append(asset_uri)
return remote_sourcemaps
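# For reference, a matched sourceMappingURL comment typically looks like
#   //# sourceMappingURL=main.3f7a2c9b.js.map
# (the filename here is illustrative); webpack appends it as the last line of a
# bundle when it emits a separate map file.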
def _parse_sourcemap(self, target, is_str=False):
map_data = ""
if is_str is False:
if os.path.isfile(target):
with open(target, 'r', encoding='utf-8', errors='ignore') as f:
map_data = f.read()
else:
map_data = target
# with the sourcemap data, pull directory structures
try:
map_object = json.loads(map_data)
except json.JSONDecodeError:
print("ERROR: Failed to parse sourcemap %s. Are you sure this is a sourcemap?" % target)
return False
# we need `sourcesContent` and `sources`.
# do a basic validation check to make sure these exist and agree.
if 'sources' not in map_object or 'sourcesContent' not in map_object:
print("ERROR: Sourcemap does not contain sources and/or sourcesContent, cannot extract.")
return False
if len(map_object['sources']) != len(map_object['sourcesContent']):
print("WARNING: sources != sourcesContent, filenames may not match content")
for source, content in zip(map_object['sources'], map_object['sourcesContent']):
# remove webpack:// from paths
# and do some checks on it
write_path = self._get_sanitised_file_path(source)
if write_path is None:
print("ERROR: Could not sanitize path %s" % source)
continue
os.makedirs(os.path.dirname(write_path), mode=0o755, exist_ok=True)
with open(write_path, 'w', encoding='utf-8', errors='ignore', newline='') as f:
print("Writing %s..." % os.path.basename(write_path))
f.write(content)
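# For reference (an illustrative map, not taken from a real bundle): a minimal
# sourcemap this method accepts looks like
#   {"version": 3,
#    "sources": ["webpack:///./src/index.js"],
#    "sourcesContent": ["console.log('hi');"],
#    "mappings": "AAAA"}
# 'sources' and 'sourcesContent' are parallel lists: entry i of 'sources' names
# the file whose full text sits at entry i of 'sourcesContent'.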
def _get_sanitised_file_path(self, sourcePath):
"""Sanitise webpack paths for separators/relative paths"""
sourcePath = sourcePath.replace("webpack:///", "")
exts = sourcePath.split(" ")
if exts[0] == "external":
print("WARNING: Found external sourcemap %s, not currently supported. Skipping" % exts[1])
return None
path, filename = os.path.split(sourcePath)
if path[:2] == './':
path = path[2:]
if path[:3] == '../':
path = 'parent_dir/' + path[3:]
if path[:1] == '.':
path = ""
filepath = self._path_sanitiser.make_valid_file_path(path, filename)
return filepath
def _get_remote_data(self, uri):
"""Get remote data via http."""
if self.disable_verify_ssl == True:
result = requests.get(uri, verify=False)
else:
result = requests.get(uri)
# Redirect
if not uri == result.url:
return self._get_remote_data(result.url)
if result.status_code == 200:
return result.text, result.url
else:
print("WARNING: Got status code %d for URI %s" % (result.status_code, result.url))
return None, result.url
class PathSanitiser(object):
"""https://stackoverflow.com/questions/13939120/sanitizing-a-file-path-in-python"""
EMPTY_NAME = "empty"
empty_idx = 0
root_path = ""
def __init__(self, root_path):
self.root_path = root_path
def ensure_directory_exists(self, path_directory):
if not os.path.exists(path_directory):
os.makedirs(path_directory)
def os_path_separators(self):
seps = []
for sep in os.path.sep, os.path.altsep:
if sep:
seps.append(sep)
return seps
def sanitise_filesystem_name(self, potential_file_path_name):
# Sort out unicode characters
valid_filename = normalize('NFKD', potential_file_path_name).encode('ascii', 'ignore').decode('ascii')
# Replace path separators with underscores
for sep in self.os_path_separators():
valid_filename = valid_filename.replace(sep, '_')
# Ensure only valid characters
valid_chars = "-_.() {0}{1}".format(string.ascii_letters, string.digits)
valid_filename = "".join(ch for ch in valid_filename if ch in valid_chars)
# Ensure at least one letter or number to ignore names such as '..'
valid_chars = "{0}{1}".format(string.ascii_letters, string.digits)
test_filename = "".join(ch for ch in potential_file_path_name if ch in valid_chars)
if len(test_filename) == 0:
# Replace empty file name or file path part with the following
valid_filename = self.EMPTY_NAME + '_' + str(self.empty_idx)
# MODIFIED BY MARSHALX
# self.empty_idx += 1
return valid_filename
def get_root_path(self):
# Replace with your own root file path, e.g. '/place/to/save/files/'
filepath = self.root_path
filepath = os.path.abspath(filepath)
# ensure trailing path separator (/)
if not any(filepath[-1] == sep for sep in self.os_path_separators()):
filepath = '{0}{1}'.format(filepath, os.path.sep)
self.ensure_directory_exists(filepath)
return filepath
def path_split_into_list(self, path):
# Gets all parts of the path as a list, excluding path separators
parts = []
while True:
newpath, tail = os.path.split(path)
if newpath == path:
assert not tail
if path and path not in self.os_path_separators():
parts.append(path)
break
if tail and tail not in self.os_path_separators():
parts.append(tail)
path = newpath
parts.reverse()
return parts
def sanitise_filesystem_path(self, potential_file_path):
# Splits up a path and sanitises the name of each part separately
path_parts_list = self.path_split_into_list(potential_file_path)
sanitised_path = ''
for path_component in path_parts_list:
sanitised_path = '{0}{1}{2}'.format(sanitised_path,
self.sanitise_filesystem_name(path_component),
os.path.sep)
return sanitised_path
def check_if_path_is_under(self, parent_path, child_path):
# Using the function to split paths into lists of component parts, check that one path is underneath another
child_parts = self.path_split_into_list(child_path)
parent_parts = self.path_split_into_list(parent_path)
if len(parent_parts) > len(child_parts):
return False
return all(part1 == part2 for part1, part2 in zip(child_parts, parent_parts))
def make_valid_file_path(self, path=None, filename=None):
root_path = self.get_root_path()
if path:
sanitised_path = self.sanitise_filesystem_path(path)
if filename:
sanitised_filename = self.sanitise_filesystem_name(filename)
complete_path = os.path.join(root_path, sanitised_path, sanitised_filename)
else:
complete_path = os.path.join(root_path, sanitised_path)
else:
if filename:
sanitised_filename = self.sanitise_filesystem_name(filename)
complete_path = os.path.join(root_path, sanitised_filename)
else:
complete_path = root_path  # neither path nor filename given
complete_path = os.path.abspath(complete_path)
if self.check_if_path_is_under(root_path, complete_path):
return complete_path
else:
return None
class SourceMapExtractorError(Exception):
pass
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="A tool to extract code from Webpack sourcemaps. Turns black boxes into gray ones.")
parser.add_argument("-l", "--local", action="store_true", default=False)
parser.add_argument("-d", "--detect", action="store_true", default=False,
help="Attempt to detect sourcemaps from JS assets in retrieved HTML.")
parser.add_argument("--make-directory", action="store_true", default=False,
help="Make the output directory if it doesn't exist.")
parser.add_argument("--dangerously-write-paths", action="store_true", default=False,
help="Write full paths. WARNING: Be careful here, you are pulling directories from an untrusted source.")
parser.add_argument("--disable-ssl-verification", action="store_true", default=False,
help="The script will not verify the site's SSL certificate.")
parser.add_argument("uri_or_file", help="The target URI or file.")
parser.add_argument("output_directory", help="Directory to output from sourcemap to.")
if (len(sys.argv) < 3):
parser.print_usage()
sys.exit(1)
args = parser.parse_args()
extractor = SourceMapExtractor(vars(args))
extractor.run()
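
For reference, the crawler invokes this script exactly as shown in crawl_mini_app_wallet above:

    python unwebpack_sourcemap.py --make-directory --detect https://walletbot.me/ wallet

With --detect it fetches the page HTML, follows each script src, and reads the trailing sourceMappingURL comment from every bundle; --make-directory creates the 'wallet' output directory if it does not already exist.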