rework exclude system; add exclude by count of slashes; add translations.telegram.org

2024-11-22 07:19:34 +01:00 · 2021-04-24 00:06:50 +02:00 · 2021-04-24 00:06:50 +02:00 · b2afbd72fc
commit b2afbd72fc
parent e385f45da6
1 changed files with 73 additions and 53 deletions
--- a/make_tracked_links_list.py
+++ b/make_tracked_links_list.py
@ -16,45 +16,66 @@ HIDDEN_URLS = {

    'telegram.org/privacy/gmailbot',
    'telegram.org/tos',
-    'telegram.org/tour'
+    'telegram.org/tour',
+
+    'translations.telegram.org',
+    'translations.telegram.org/en/android',
+    'translations.telegram.org/en/ios',
+    'translations.telegram.org/en/tdesktop',
+    'translations.telegram.org/en/macos',
+    'translations.telegram.org/en/android_x',
 }
 BASE_URL_REGEX = r'telegram.org'

+# disable crawling sub links for specific domains and url patches
 EXCLUDE_RULES = {
-    # '' means exclude all
+    # '*' means exclude all
    'translations.telegram.org': {
-        '',
+        # 'max_count_of_slashes': 3,
+        'patches': {
+            '*',
+        }
    },
    'bugs.telegram.org': {
-        'c/',
+        'patches': {
+            'c/',
+        },
    },
    'instantview.telegram.org': {
-        'file/',
+        'patches': {
+            'file/',

-        'templates/',
-        'samples/',
-        'contest/',
+            'templates/',
+            'samples/',
+            'contest/',
+        },
    },
    'corefork.telegram.org': {
-        'file/',
+        'patches': {
+            'file/',

-        'tdlib/',
+            'tdlib/docs/',

-        'constructor/',
-        'method/',
-        'type/',
+            'constructor/',
+            'method/',
+            'type/',
+        },
    },
    'core.telegram.org': {
-        'file/',
+        'patches': {
+            'file/',

-        'tdlib/',
+            'tdlib/docs/',

-        'constructor/',
-        'method/',
-        'type/',
+            'constructor/',
+            'method/',
+            'type/',
+        },
    },
    'telegram.org': {
-        'file/',
+        'patches': {
+            'file/',
+        },
    }
 }

@ -76,35 +97,37 @@ VISITED_LINKS = set()
 LINKS_TO_TRACK = set()


+def should_exclude(url: str, direct_link=None) -> bool:
+    if not direct_link:
+        direct_link = re.findall(DIRECT_LINK_REGEX, url)[0]
+    domain_exclude_rules = EXCLUDE_RULES.get(direct_link, dict())
+
+    max_count_of_slashes = domain_exclude_rules.get('max_count_of_slashes')
+    exclude_patches = domain_exclude_rules.get('patches', set())
+
+    if '*' in exclude_patches:
+        return True
+
+    if max_count_of_slashes and max_count_of_slashes < url.count('/'):
+        return True
+
+    for path in exclude_patches:
+        if path in url:
+            return True
+
+    return False
+
+
 def find_absolute_links(html: str) -> set[str]:
    absolute_links = set(re.findall(ABSOLUTE_LINK_REGEX, html))

-    filtered_links = set()
-    for link in absolute_links:
-        def _():
-            direct_link = re.findall(DIRECT_LINK_REGEX, link)[0]
-            exceptions = EXCLUDE_RULES.get(direct_link, set())
-            # optimization. when we want to exclude domain
-            if '' in exceptions:
-                return
-
-            for exclude_path in exceptions:
-                if exclude_path in link:
-                    return
-
-            filtered_links.add(link)
-
-        _()
-        # Yeah, I don't care about DRY
-
-    return filtered_links
+    return {link for link in absolute_links if not should_exclude(link)}


 def find_relative_links(html: str, cur_link: str) -> set[str]:
    direct_cur_link = re.findall(DIRECT_LINK_REGEX, cur_link)[0]
-    exceptions = EXCLUDE_RULES.get(direct_cur_link, set())
    # optimization. when we want to exclude domain
-    if '' in exceptions:
+    if should_exclude(cur_link):
        return set()

    relative_links = set()
@ -113,20 +136,17 @@ def find_relative_links(html: str, cur_link: str) -> set[str]:
        links = re.findall(regex, html)

        for link in links:
-            def _():
-                for exclude_path in exceptions:
-                    if exclude_path in link:
-                        return
+            if should_exclude(link, direct_cur_link):
+                continue

-                if link.startswith('/'):
-                    # bypass //www.apple and etc shit ;d
-                    if find_absolute_links(link):
-                        # absolute links starting with double slash
+            if link.startswith('/'):
+                # bypass //www.apple and etc shit ;d
+                if find_absolute_links(link):
+                    # absolute links starting with double slash
+                    if not should_exclude(link):
                        relative_links.add(link[1::])
-                else:
-                    relative_links.add(f'{direct_cur_link}/{link}')
-
-            _()
+            else:
+                relative_links.add(f'{direct_cur_link}/{link}')

    return relative_links

@ -187,7 +207,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
                # TODO track hashes of image/svg/video content types
                logger.info(f'Unhandled type: {content_type}')
    except:
-        logger.warning('Codec can\'t decode byte. So its was a tgs file')
+        logger.warning('Mb codec can\'t decode byte. So its was a tgs file')


 async def start(url_list: set[str]):