mirror of https://github.com/MarshalX/telegram-crawler.git
synced 2024-11-21 23:06:40 +01:00

add tracking of all media

commit 961320022c (parent e217c3f5ec)
3 changed files with 71 additions and 20 deletions
@@ -79,10 +79,6 @@ resources (-s flag of apktool to disable disassembly of dex files).
 Writing a check for the need for decompilation by the hash of the apk file
 would take more time.
 
-### TODO list
-
-- add storing hashes of image, svg, video.
-
 ### Example of link crawler rules configuration
 
 ```python
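Aside: the TODO list above is removed because this commit implements it; the hunks below start storing hashes of media responses. For the apk-hash check the README mentions as future work, a minimal sketch of such a gate could look like this (file paths and the helper name are hypothetical, not from the repo):

```python
# Sketch only: skip an expensive apktool run when the apk is unchanged.
import hashlib
from pathlib import Path

APK_PATH = Path('telegram.apk')          # hypothetical apk location
HASH_PATH = Path('telegram.apk.sha256')  # hypothetical digest cache

def apk_changed() -> bool:
    current = hashlib.sha256(APK_PATH.read_bytes()).hexdigest()
    previous = HASH_PATH.read_text().strip() if HASH_PATH.exists() else ''
    HASH_PATH.write_text(current)
    return current != previous  # decompile only when the digest moved
```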
@@ -47,6 +47,10 @@ logging.basicConfig(format='%(message)s', level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
 
+def get_hash(data: bytes) -> str:
+    return hashlib.sha256(data).hexdigest()
+
+
 async def download_file(url, path, session):
     async with session.get(url) as response:
         if response.status != 200:
@@ -96,7 +100,7 @@ async def track_additional_files(
         content = await r_file.read()
 
         if save_hash_only:
-            await w_file.write(hashlib.sha256(content).hexdigest())
+            await w_file.write(get_hash(content))
             continue
 
         content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
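With the new helper in place, the save_hash_only branch writes a 64-character digest instead of the file body, which keeps binary media diffable as one-line text. A standalone sketch of that flow, using synchronous file I/O instead of the repo's aiofiles handles:

```python
import hashlib

def get_hash(data: bytes) -> str:
    # same body as the helper added in the diff
    return hashlib.sha256(data).hexdigest()

def store(content: bytes, out_path: str, save_hash_only: bool) -> None:
    # Sketch: binary media is reduced to its digest; text assets are kept whole.
    with open(out_path, 'w', encoding='utf-8') as w_file:
        if save_hash_only:
            w_file.write(get_hash(content))  # 64 hex chars, diff-friendly
        else:
            w_file.write(content.decode('utf-8'))
```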
@@ -194,6 +198,23 @@ async def collect_translations_paginated_content(url: str, session: aiohttp.ClientSession
     return '\n'.join(content)
 
 
+def is_hashable_only_content_type(content_type) -> bool:
+    hashable_only_content_types = (
+        'png',
+        'jpeg',
+        'x-icon',
+        'gif',
+        'mp4',
+        'webm',
+    )
+
+    for hashable_only_content_type in hashable_only_content_types:
+        if hashable_only_content_type in content_type:
+            return True
+
+    return False
+
+
 async def crawl(url: str, session: aiohttp.ClientSession):
     try:
         logger.info(f'Process {url}')
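The new predicate matches by substring, so full MIME types such as image/png or video/mp4 hit the short tokens. A condensed, standalone equivalent for illustration:

```python
def is_hashable_only_content_type(content_type) -> bool:
    # condensed rewrite of the function added in the hunk above
    hashable_only_content_types = ('png', 'jpeg', 'x-icon', 'gif', 'mp4', 'webm')
    return any(token in content_type for token in hashable_only_content_types)

assert is_hashable_only_content_type('image/png')
assert is_hashable_only_content_type('video/mp4; charset=binary')
assert not is_hashable_only_content_type('text/html')
```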
@@ -210,16 +231,35 @@ async def crawl(url: str, session: aiohttp.ClientSession):
 
             # bypass external slashes and so on
             url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
 
+            is_hashable_only = is_hashable_only_content_type(response.content_type)
+            # amazing dirt for media files like
+            # telegram.org/file/811140591/1/q7zZHjgES6s/9d121a89ffb0015837
+            # with response content type HTML instead of image. Shame on you
+            # sometimes it returns correct type. noice load balancing
+            is_sucking_file = '/file/' in url and 'text' in response.content_type
+
             # handle pure domains and html pages without ext in url
             ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
 
+            # I don't add ext by content type for images and so on cuz TG servers sucks.
+            # Some servers do not return correct content type. Some servers do...
+            if is_hashable_only or is_sucking_file:
+                ext = ''
+
             filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
+            os.makedirs(os.path.dirname(filename), exist_ok=True)
+
+            if is_sucking_file or is_hashable_only:
+                content = await response.read()
+                async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
+                    await f.write(get_hash(content))
+                return
+
             content = await response.text(encoding='UTF-8')
             if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
                 content = await collect_translations_paginated_content(url, session)
 
-            os.makedirs(os.path.dirname(filename), exist_ok=True)
             async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
                 content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
                 content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
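The path mapping deserves a note: bare domains and extension-less pages get .html appended, while media URLs keep no extension since only a digest is written. A sketch of that logic with OUTPUT_FOLDER and ILLEGAL_PATH_CHARS stubbed in (the real values live elsewhere in the script; url_to_filename is a hypothetical wrapper):

```python
# Stand-in values for the example only.
ILLEGAL_PATH_CHARS = ''  # every string contains '', so empty parts drop out
OUTPUT_FOLDER = 'data/'

def url_to_filename(url: str, hash_only: bool) -> str:
    url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
    ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
    if hash_only:
        ext = ''  # a digest is stored, so no extension is implied
    return OUTPUT_FOLDER + '/'.join(url_parts) + ext

print(url_to_filename('telegram.org', False))                # data/telegram.org.html
print(url_to_filename('telegram.org/js/widget.js', False))   # data/telegram.org/js/widget.js
print(url_to_filename('telegram.org/file/811140591', True))  # data/telegram.org/file/811140591
```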
@@ -12,7 +12,7 @@ from aiohttp import ClientConnectorError, ServerDisconnectedError
 
 PROTOCOL = 'https://'
 BASE_URL = 'telegram.org'
-# its necessary to help crawler to find more links
+# it's necessary to help crawler to find more links
 HIDDEN_URLS = {
     'corefork.telegram.org',
     'corefork.telegram.org/getProxyConfig',
@@ -111,8 +111,7 @@ CRAWL_RULES = {
     },
     'telegram.org': {
         'deny': {
-            'file/',
-            r'apps$'
+            r'apps$',
         },
     },
     'webz.telegram.org': {
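Dropping 'file/' from the deny set is the list-side half of this commit: telegram.org/file/... URLs may now be collected and tracked. The deny entries are applied as regular expressions (hence the r'apps$' anchor); a sketch of how such a set can be evaluated, with the matching helper assumed rather than copied from the repo:

```python
import re

# Assumed shape: one domain's deny rules as regex patterns, as in the hunk above.
DENY = {r'apps$'}

def is_denied(path: str) -> bool:
    # re.search treats a plain string like 'file/' and an anchored
    # pattern like r'apps$' uniformly
    return any(re.search(rule, path) for rule in DENY)

assert is_denied('apps')                # anchored match at the end of the path
assert not is_denied('apps/android')    # the $ anchor stops the match here
assert not is_denied('file/811140591')  # no longer denied after this commit
```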
@@ -227,6 +226,28 @@ def cleanup_links(links: set[str]) -> set[str]:
     return cleaned_links
 
 
+def is_trackable_content_type(content_type) -> bool:
+    trackable_content_types = (
+        'javascript',
+        'css',
+        'plain',
+        'json',
+        'svg',
+        'png',
+        'jpeg',
+        'x-icon',
+        'gif',
+        'mp4',
+        'webm',
+    )
+
+    for trackable_content_type in trackable_content_types:
+        if trackable_content_type in content_type:
+            return True
+
+    return False
+
+
 async def crawl(url: str, session: aiohttp.ClientSession):
     if url in VISITED_LINKS:
         return
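Note that this tuple is a strict superset of is_hashable_only_content_type above: the extra entries are the text-like types that can be diffed verbatim, while the shared media types get reduced to hashes. A quick check:

```python
# The two tuples from this commit, side by side; the difference is exactly
# the set of text-like types that are diffed rather than hashed.
trackable = {'javascript', 'css', 'plain', 'json', 'svg',
             'png', 'jpeg', 'x-icon', 'gif', 'mp4', 'webm'}
hashable_only = {'png', 'jpeg', 'x-icon', 'gif', 'mp4', 'webm'}

assert hashable_only < trackable          # strict subset
print(sorted(trackable - hashable_only))  # the text-diffable types
```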
@@ -257,19 +278,13 @@ async def crawl(url: str, session: aiohttp.ClientSession):
 
                 sub_links = absolute_links | relative_links
                 await asyncio.gather(*[crawl(url, session) for url in sub_links])
-            elif 'application/javascript' in content_type:
-                LINKS_TO_TRACK.add(url)
-            elif 'css' in content_type:
-                LINKS_TO_TRACK.add(url)
-            elif 'plain' in content_type:
-                LINKS_TO_TRACK.add(url)
-            elif 'application/json' in content_type:
+            elif is_trackable_content_type(content_type):
                 LINKS_TO_TRACK.add(url)
             else:
-                # TODO track hashes of image/svg/video content types
-                logger.info(f'Unhandled type: {content_type}')
+                # for example, zip with update of macOS client
+                logger.info(f'Unhandled type: {content_type} from {url}')
 
-            # telegram url can work with and without trailing slash (no redirect). P.S. not on every sub domain ;d
+            # telegram url can work with and without trailing slash (no redirect). P.S. not on every subdomain ;d
             # so this is a problem when we have random behavior with link will be added
             # this if resolve this issue. If available both link we prefer without trailing slash
             without_trailing_slash = url[:-1:] if url.endswith('/') else url
@@ -277,7 +292,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
                     f'{without_trailing_slash}/' in LINKS_TO_TRACK:
                 LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')
     except UnicodeDecodeError:
-        logger.warning('Codec can\'t decode byte. So its was a tgs file')
+        logger.warning(f'Codec can\'t decode bytes. So it was a tgs file or response with broken content type {url}')
     except ClientConnectorError:
         logger.warning(f'Wrong link: {url}')
     except (ServerDisconnectedError, TimeoutError):
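Besides the broader except message, this block dedupes trailing-slash twins: when both url and url/ were collected, the slashed variant is dropped. A sketch of that normalization over a plain set, with names mirroring the diff:

```python
# Sketch: prefer the slash-less variant when both forms of a URL were collected.
LINKS_TO_TRACK = {'telegram.org/apps', 'telegram.org/apps/', 'telegram.org/faq'}

def prefer_without_trailing_slash(url: str) -> None:
    without_trailing_slash = url[:-1] if url.endswith('/') else url
    if without_trailing_slash in LINKS_TO_TRACK and \
            f'{without_trailing_slash}/' in LINKS_TO_TRACK:
        LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')

for u in list(LINKS_TO_TRACK):  # copy: we mutate the set while iterating
    prefer_without_trailing_slash(u)

assert LINKS_TO_TRACK == {'telegram.org/apps', 'telegram.org/faq'}
```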