add tracking of all media

2024-10-23 17:47:21 +02:00 · 2022-04-10 17:46:36 +02:00 · 2022-04-10 17:46:36 +02:00 · 961320022c
commit 961320022c
parent e217c3f5ec
3 changed files with 71 additions and 20 deletions
--- a/README.md
+++ b/README.md
@ -79,10 +79,6 @@ resources (-s flag of apktool to disable disassembly of dex files).
 Writing a check that uses the hash of the apk file to decide whether
 decompilation is needed would take more time.
 ### TODO list
 - add storing hashes of image, svg, video.
 ### Example of link crawler rules configuration
 ```python
--- a/make_files_tree.py
+++ b/make_files_tree.py
@ -47,6 +47,10 @@ logging.basicConfig(format='%(message)s', level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 def get_hash(data: bytes) -> str:
    """Return the SHA-256 digest of ``data`` as a hexadecimal string."""
    hasher = hashlib.sha256(data)
    return hasher.hexdigest()
 async def download_file(url, path, session):
    async with session.get(url) as response:
        if response.status != 200:
@ -96,7 +100,7 @@ async def track_additional_files(
                content = await r_file.read()
                if save_hash_only:
-                    await w_file.write(hashlib.sha256(content).hexdigest())
+                    await w_file.write(get_hash(content))
                    continue
                content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
@ -194,6 +198,23 @@ async def collect_translations_paginated_content(url: str, session: aiohttp.Clie
    return '\n'.join(content)
 def is_hashable_only_content_type(content_type) -> bool:
    """Tell whether ``content_type`` names one of the hash-only media types.

    The check is a plain substring match against a fixed set of image and
    video markers, so ``'image/png'`` and ``'video/mp4'`` match while
    ``'text/html'`` does not.
    """
    media_markers = ('png', 'jpeg', 'x-icon', 'gif', 'mp4', 'webm')
    return any(marker in content_type for marker in media_markers)
 async def crawl(url: str, session: aiohttp.ClientSession):
    try:
        logger.info(f'Process {url}')
@ -210,16 +231,35 @@ async def crawl(url: str, session: aiohttp.ClientSession):
            # bypass external slashes and so on
            url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
            is_hashable_only = is_hashable_only_content_type(response.content_type)
            # amazing dirt for media files like
            # telegram.org/file/811140591/1/q7zZHjgES6s/9d121a89ffb0015837
            # with response content type HTML instead of image. Shame on you
            # sometimes it returns correct type. noice load balancing
            is_sucking_file = '/file/' in url and 'text' in response.content_type
            # handle pure domains and html pages without ext in url
            ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
            # I don't add ext by content type for images and so on cuz TG servers sucks.
            # Some servers do not return correct content type. Some servers do...
            if is_hashable_only or is_sucking_file:
                ext = ''
            filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            if is_sucking_file or is_hashable_only:
                content = await response.read()
                async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
                    await f.write(get_hash(content))
                return
            content = await response.text(encoding='UTF-8')
            if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
                content = await collect_translations_paginated_content(url, session)
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
                content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
                content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
--- a/make_tracked_links_list.py
+++ b/make_tracked_links_list.py
@ -12,7 +12,7 @@ from aiohttp import ClientConnectorError, ServerDisconnectedError
 PROTOCOL = 'https://'
 BASE_URL = 'telegram.org'
-# its necessary to help crawler to find more links
+# it's necessary to help crawler to find more links
 HIDDEN_URLS = {
    'corefork.telegram.org',
    'corefork.telegram.org/getProxyConfig',
@ -111,8 +111,7 @@ CRAWL_RULES = {
    },
    'telegram.org': {
        'deny': {
-            'file/',
+            r'apps$',
            r'apps$'
        },
    },
    'webz.telegram.org': {
@ -227,6 +226,28 @@ def cleanup_links(links: set[str]) -> set[str]:
    return cleaned_links
 def is_trackable_content_type(content_type) -> bool:
    """Tell whether ``content_type`` contains one of the trackable markers.

    Matching is by substring, so full MIME strings such as
    ``'application/javascript'`` or ``'image/svg+xml'`` are recognised
    through their ``'javascript'`` / ``'svg'`` parts.
    """
    trackable_markers = (
        'javascript', 'css', 'plain', 'json', 'svg',
        'png', 'jpeg', 'x-icon', 'gif', 'mp4', 'webm',
    )
    return any(marker in content_type for marker in trackable_markers)
 async def crawl(url: str, session: aiohttp.ClientSession):
    if url in VISITED_LINKS:
        return
@ -257,19 +278,13 @@ async def crawl(url: str, session: aiohttp.ClientSession):
                sub_links = absolute_links | relative_links
                await asyncio.gather(*[crawl(url, session) for url in sub_links])
-            elif 'application/javascript' in content_type:
+            elif is_trackable_content_type(content_type):
                LINKS_TO_TRACK.add(url)
            elif 'css' in content_type:
                LINKS_TO_TRACK.add(url)
            elif 'plain' in content_type:
                LINKS_TO_TRACK.add(url)
            elif 'application/json' in content_type:
                LINKS_TO_TRACK.add(url)
            else:
-                # TODO track hashes of image/svg/video content types
+                # for example, zip with update of macOS client
-                logger.info(f'Unhandled type: {content_type}')
+                logger.info(f'Unhandled type: {content_type} from {url}')
-            # telegram url can work with and without trailing slash (no redirect). P.S. not on every sub domain ;d
+            # telegram url can work with and without trailing slash (no redirect). P.S. not on every subdomain ;d
            # so this is a problem when we have random behavior with link will be added
            # this if resolve this issue. If available both link we prefer without trailing slash
            without_trailing_slash = url[:-1:] if url.endswith('/') else url
@ -277,7 +292,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
                    f'{without_trailing_slash}/' in LINKS_TO_TRACK:
                LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')
    except UnicodeDecodeError:
-        logger.warning('Codec can\'t decode byte. So its was a tgs file')
+        logger.warning(f'Codec can\'t decode bytes. So it was a tgs file or response with broken content type {url}')
    except ClientConnectorError:
        logger.warning(f'Wrong link: {url}')
    except (ServerDisconnectedError, TimeoutError):