fix .html file type

This commit is contained in:
Ilya (Marshal) 2023-10-21 19:25:38 +02:00
parent 9f4fc50277
commit 2e6b22f72f
No known key found for this signature in database
GPG key ID: F8FB1A006AD3156D

View file

@ -2,6 +2,7 @@ import asyncio
import hashlib
import json
import logging
import mimetypes
import os
import platform
import random
@ -709,8 +710,12 @@ async def _crawl(url: str, session: aiohttp.ClientSession, output_dir: str):
# sometimes it returns correct type. noice load balancing
is_sucking_file = '/file/' in url and 'text' in response.content_type
# handle pure domains and html pages without ext in url
ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
# handle pure domains and html pages without ext in url as html to enable syntax highlighting
page_type, _ = mimetypes.guess_type(url)
if url.endswith('.tl'):
page_type = 'text/plain'
ext = '.html' if page_type is None or len(url_parts) == 1 else ''
# I don't add ext by content type for images and so on because TG servers suck.
# Some servers do not return correct content type. Some servers do...