add 304 as success response;

add handling of xml; add sparkle file of macOS client; fix request retrying with 500 status code; fix encoding.
2025-01-05 18:38:19 +01:00 · 2021-12-30 14:22:01 +01:00 · 2021-12-30 14:22:01 +01:00 · 344e550bd9
commit 344e550bd9
parent 6f3d2bce6d
2 changed files with 9 additions and 6 deletions
--- a/make_files_tree.py
+++ b/make_files_tree.py
@@ -43,7 +43,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
            if response.status == 500:
                return await asyncio.gather(crawl(url, session))

-            if response.status != 200:
+            if response.status not in {200, 304}:
                if response.status != 302:
                    content = await response.text()
                    logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
@@ -58,7 +58,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):

            os.makedirs(os.path.dirname(filename), exist_ok=True)
            async with aiofiles.open(filename, 'w') as f:
-                content = await response.text()
+                content = await response.text(encoding='UTF-8')
                content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
                content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
                content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
--- a/make_tracked_links_list.py
+++ b/make_tracked_links_list.py
@@ -24,6 +24,8 @@ HIDDEN_URLS = {

    'desktop.telegram.org/changelog',

+    'osx.telegram.org/updates/versions.xml',
+
    'instantview.telegram.org/rules',

    'core.telegram.org/resources/cidr.txt',
@@ -222,18 +224,19 @@ async def crawl(url: str, session: aiohttp.ClientSession):
            content_type = response.headers.get('content-type')

            if response.status == 500:
+                VISITED_LINKS.remove(url)
                return await asyncio.gather(crawl(url, session))

-            if response.status != 200:
+            if response.status not in {200, 304}:
                if response.status != 302:
-                    content = await response.text()
+                    content = await response.text(encoding='UTF-8')
                    logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
                return

-            if 'text/html' in content_type:
+            if 'text' in content_type:
                LINKS_TO_TRACK.add(url)

-                html = await response.text()
+                html = await response.text(encoding='UTF-8')
                absolute_links = cleanup_links(find_absolute_links(html))
                relative_links = cleanup_links(find_relative_links(html, url))