From dcd5adc08a349f9836c63fc295a61d2d199d32b7 Mon Sep 17 00:00:00 2001
From: "Il'ya (Marshal)"
Date: Sat, 24 Apr 2021 22:40:25 +0200
Subject: [PATCH] fix, improve and clean up links crawler; add diff of files

---
 README.md                  |  3 +-
 make_tracked_links_list.py | 62 ++++++++++++++++++++------------------
 2 files changed, 34 insertions(+), 31 deletions(-)

diff --git a/README.md b/README.md
index 07902d9f94..4cbc8f7885 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,8 @@ Copy of Telegram websites stored **[here](https://github.com/MarshalX/telegram-c
 
 ### TODO list
 
 - bug fixes;
-- alert system.
+- alert system;
+- store hashes of image, svg and video content.
 
 ### Example of link crawler rules configuration

diff --git a/make_tracked_links_list.py b/make_tracked_links_list.py
index d8596e76a5..2219e72b01 100644
--- a/make_tracked_links_list.py
+++ b/make_tracked_links_list.py
@@ -77,8 +77,8 @@ CRAWL_RULES = {
 }
 
 DIRECT_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,249}' + BASE_URL_REGEX + r')'
-ABSOLUTE_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{1,249}' + BASE_URL_REGEX + r'\b[-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)'
-RELATIVE_LINK_REGEX = r'\/([-a-zA-Z0-9\/@:%._\+~#]{0,249})'
+ABSOLUTE_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,248}' + BASE_URL_REGEX + r'\b[-a-zA-Z0-9@:%_\+.~#?&//=]*)'
+RELATIVE_LINK_REGEX = r'\/(?!\/)([-a-zA-Z0-9\/@:%._\+~#]{0,249})'
 
 DOM_ATTRS = ['href', 'src']
 
@@ -137,16 +137,9 @@ def find_relative_links(html: str, cur_link: str) -> set[str]:
     links = re.findall(regex, html)
 
     for link in links:
-        # bypass //www.apple and etc shit ;d
-        if link.startswith('/'):
-            # absolute links starting with double slash
-            if find_absolute_links(link):
-                if not should_exclude(link[1::]):
-                    relative_links.add(link[1::])
-        else:
-            url = f'{direct_cur_link}/{link}'
-            if not should_exclude(url):
-                relative_links.add(url)
+        url = f'{direct_cur_link}/{link}'
+        if not should_exclude(url):
+            relative_links.add(url)
 
     return relative_links
 
@@ -159,10 +152,16 @@ def cleanup_links(links: set[str]) -> set[str]:
         link = unescape(link)
         link = link.replace('www.', '')
         link = link.replace('http://', '').replace('https://', '')
 
+        # skip anchor links
         if '#' in link:
             continue
 
+        # strip GET params from the link
+        if '?' in link:
+            link = link.split('?')[0]
+
+        # skip mailto: links
         link_parts = link.split('.')
         if '@' in link_parts[0]:
             continue
@@ -173,14 +172,9 @@
 
 
 async def crawl(url: str, session: aiohttp.ClientSession):
-    # todo
-    if url.endswith('.'):
+    if url in VISITED_LINKS:
         return
-
-    without_trailing_slash = url[:-1:] if url.endswith('/') else url
-    if without_trailing_slash in VISITED_LINKS:
-        return
-    VISITED_LINKS.add(without_trailing_slash)
+    VISITED_LINKS.add(url)
 
     try:
         logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
@@ -188,17 +182,6 @@ async def crawl(url: str, session: aiohttp.ClientSession):
             status_code = response.status
             content_type = response.headers.get('content-type')
 
-            # if it was redirect to link with trailing slash - handle this url
-            if 300 < status_code < 400:
-                location = response.headers.get('location', '')
-                # todo rewrite logic
-                if without_trailing_slash in location:
-                    if not should_exclude(location):
-                        # nice shit bro
-                        logger.info(f'Trailing slash. {location}')
-                        cleaned_link = list(cleanup_links({location}))[0]
-                        await asyncio.gather(crawl(cleaned_link, session))
-
             if status_code != 200:
                 return
 
@@ -220,9 +203,18 @@ async def crawl(url: str, session: aiohttp.ClientSession):
             else:
                 # TODO track hashes of image/svg/video content types
                 logger.info(f'Unhandled type: {content_type}')
+
+            # Telegram URLs can work both with and without a trailing slash (no redirect), though not on every subdomain.
+            # That makes it random which variant of a link gets collected,
+            # so when both variants are present, prefer the one without the trailing slash.
+            without_trailing_slash = url[:-1] if url.endswith('/') else url
+            if without_trailing_slash in LINKS_TO_TRACK and \
+                    f'{without_trailing_slash}/' in LINKS_TO_TRACK:
+                LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')
     except UnicodeDecodeError:
         logger.warning('Codec can\'t decode byte. So its was a tgs file')
     except (TimeoutError, ClientConnectorError):
+        logger.warning(f'Retrying {url}')
         await asyncio.gather(crawl(url, session))
 
 
@@ -239,5 +231,15 @@ if __name__ == '__main__':
     asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))
     logger.info(f'Stop crawling links. {time() - start_time} sec.')
 
+    try:
+        with open(OUTPUT_FILENAME, 'r') as f:
+            OLD_URL_LIST = {line.rstrip('\n') for line in f}
+
+        logger.info(f'Is equal: {OLD_URL_LIST == LINKS_TO_TRACK}')
+        logger.info(f'Deleted: {OLD_URL_LIST - LINKS_TO_TRACK}')
+        logger.info(f'Added: {LINKS_TO_TRACK - OLD_URL_LIST}')
+    except IOError:
+        pass
+
     with open(OUTPUT_FILENAME, 'w') as f:
         f.write('\n'.join(sorted(LINKS_TO_TRACK)))
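
A quick sanity check of the RELATIVE_LINK_REGEX change (a minimal sketch: the sample HTML is made up, and the href="..." anchoring is an assumption about how find_relative_links builds its per-attribute pattern from DOM_ATTRS). The new (?!\/) lookahead rejects protocol-relative //host links at the regex level, which is what lets the patch drop the removed link.startswith('/') special case:

    import re

    # RELATIVE_LINK_REGEX before and after the patch.
    OLD_RELATIVE = r'\/([-a-zA-Z0-9\/@:%._\+~#]{0,249})'
    NEW_RELATIVE = r'\/(?!\/)([-a-zA-Z0-9\/@:%._\+~#]{0,249})'

    html = '<a href="/apps">Apps</a> <a href="//www.apple.com/shop">Shop</a>'

    # The old pattern also captures the protocol-relative link,
    # which is why the removed workaround existed.
    print(re.findall(f'href="{OLD_RELATIVE}"', html))  # ['apps', '/www.apple.com/shop']

    # The new pattern fails on '//' right after the opening slash.
    print(re.findall(f'href="{NEW_RELATIVE}"', html))  # ['apps']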
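
Likewise, a minimal illustration of the trailing-slash preference added at the end of crawl() (hypothetical URLs; LINKS_TO_TRACK stands in for the script's global set):

    LINKS_TO_TRACK = {'telegram.org/blog', 'telegram.org/blog/', 'telegram.org/apps'}

    # When both variants of a URL were collected, keep only the one
    # without the trailing slash, mirroring the patched crawl().
    for url in sorted(LINKS_TO_TRACK):
        without_trailing_slash = url[:-1] if url.endswith('/') else url
        if without_trailing_slash in LINKS_TO_TRACK and \
                f'{without_trailing_slash}/' in LINKS_TO_TRACK:
            LINKS_TO_TRACK.discard(f'{without_trailing_slash}/')

    print(sorted(LINKS_TO_TRACK))  # ['telegram.org/apps', 'telegram.org/blog']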