telegram-crawler/make_tracked_links_list.py

import asyncio
import logging
import os
import re
from asyncio.exceptions import TimeoutError
from html import unescape
from time import time
from urllib.parse import unquote

import aiohttp
from aiohttp import ClientConnectorError, ServerDisconnectedError

# Scheme prepended to a URL right before it is requested; URLs themselves are
# kept without a scheme everywhere else in this module.
PROTOCOL = 'https://'
# Root domain; added to HIDDEN_URLS as one more crawl seed when run as a script.
BASE_URL = 'telegram.org'
# Seed URLs for the crawler. They are listed explicitly to help the crawler
# find more links than it would discover starting from BASE_URL alone.
HIDDEN_URLS = {
    'corefork.telegram.org',
    'corefork.telegram.org/getProxyConfig',

    'telegram.org/privacy/gmailbot',
    'telegram.org/tos',
    'telegram.org/tour',
    'telegram.org/evolution',

    'desktop.telegram.org/changelog',

    'osx.telegram.org/updates/versions.xml',

    'instantview.telegram.org/rules',

    'core.telegram.org/resources/cidr.txt',
    'core.telegram.org/apple_privacy',
    'core.telegram.org/getProxyConfig',

    'promote.telegram.org',
    'contest.com',
}
# URLs merged into LINKS_TO_TRACK as-is when run as a script; they are never
# passed to the crawler.
ADDITIONAL_URLS = {
    'raw.githubusercontent.com/telegramdesktop/tdesktop/dev/Telegram/Resources/tl/mtproto.tl',
    'raw.githubusercontent.com/telegramdesktop/tdesktop/dev/Telegram/Resources/tl/api.tl',
    'raw.githubusercontent.com/tdlib/td/master/td/generate/scheme/telegram_api.tl',
    'raw.githubusercontent.com/tdlib/td/master/td/generate/scheme/td_api.tl',
}
# Regex fragment matching the base domain. The dot is escaped so that only a
# literal '.' is accepted; an unescaped '.' would also match strings such as
# 'telegramXorg' or 'telegram/org'.
BASE_URL_REGEX = r'telegram\.org'

# Per-domain rules that disable crawling of sub links for specific URL patterns.
# Keys are domains exactly as captured by DIRECT_LINK_REGEX; a domain that is
# not listed here is never excluded.
CRAWL_RULES = {
    # every rule is a regex applied to the whole url with re.search
    # empty string means match any url
    # allow rules have higher priority than deny rules
    'translations.telegram.org': {
        'allow': {
            r'^[^/]*$',  # root
            r'org/[^/]*/$',  # 1 lvl sub
            r'/en/[a-z_]+/$'  # 1 lvl after /en/
        },
        'deny': {
            '',  # all
        }
    },
    'bugs.telegram.org': {  # deny everything; the finer-grained rule below is disabled
        'deny': {
            # r'/c/[0-9]+/[0-9]+',  # disable comments
            '',
        },
    },
    'instantview.telegram.org': {
        'deny': {
            'file/',

            r'templates/.+',
            'samples/',
            'contest',
        },
    },
    'core.telegram.org': {
        'deny': {
            'file/',

            'bots/payments',

            'tdlib/docs/classtd',

            'validatedRequestedInfo',
        },
    },
    'corefork.telegram.org': {
        'deny': {
            'file/',

            'bots/payments',

            'tdlib/docs/classtd',

            'validatedRequestedInfo',
        },
    },
    'telegram.org': {
        'deny': {
            'file/',
            r'apps$'
        },
    },
    'webz.telegram.org': {
        'deny': {
            '',
        },
    },
    'webk.telegram.org': {
        'deny': {
            '',
        },
    },
}

# Captures only the host: an optional prefix of URL-ish characters (no '/')
# followed by the base domain.
DIRECT_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,249}' + BASE_URL_REGEX + r')'
# Captures the host plus any path / query characters that follow it.
ABSOLUTE_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,248}' + BASE_URL_REGEX + r'\b[-a-zA-Z0-9@:%_\+.~#?&//=]*)'
# Matches a path that starts with exactly one '/' (protocol-relative '//' is
# rejected) and captures everything after that slash.
RELATIVE_LINK_REGEX = r'\/(?!\/)([-a-zA-Z0-9\/@:%._\+~#]{0,249})'

# HTML attributes whose values are scanned for relative links.
DOM_ATTRS = ['href', 'src']

# File the resulting list is written to (overridable via environment).
OUTPUT_FILENAME = os.environ.get('OUTPUT_FILENAME', 'tracked_links.txt')
# File with the previous list to diff against; defaults to the output file.
COMPARE_OUTPUT_WITH_FILENAME = os.environ.get('COMPARE_OUTPUT_WITH_FILENAME', OUTPUT_FILENAME)

# Insecure but simple: ssl=False makes aiohttp skip TLS certificate verification.
CONNECTOR = aiohttp.TCPConnector(ssl=False)
# Upper bound, in seconds, for a whole request.
TIMEOUT = aiohttp.ClientTimeout(total=30)

logging.basicConfig(format='%(message)s', level=logging.DEBUG)
logger = logging.getLogger(__name__)

# URLs that crawl() has already started processing (used to avoid repeats).
VISITED_LINKS = set()
# Result set: URLs whose content type makes them worth tracking.
LINKS_TO_TRACK = set()


def should_exclude(url: str) -> bool:
    """Decide whether ``url`` must be skipped according to ``CRAWL_RULES``.

    The host part of ``url`` (the first ``DIRECT_LINK_REGEX`` match) selects
    the rule set. Every rule is a regex applied to the whole URL with
    ``re.search``. A matching ``allow`` rule always wins over ``deny`` rules.

    :param url: link to check.
    :return: ``True`` when a deny rule matches and no allow rule does;
        ``False`` otherwise, including when the URL contains no recognizable
        host or when its host has no rules.
    """
    host_match = re.search(DIRECT_LINK_REGEX, url)
    if host_match is None:
        # No host to look rules up by. Indexing the empty findall() result
        # used to raise IndexError here; such a URL simply has no rules.
        return False

    domain_rules = CRAWL_RULES.get(host_match.group(1))
    if not domain_rules:
        return False

    # allow rules have higher priority than deny rules
    if any(re.search(regex, url) for regex in domain_rules.get('allow', ())):
        return False

    return any(re.search(regex, url) for regex in domain_rules.get('deny', ()))


def find_absolute_links(html: str) -> set[str]:
    """Collect absolute links found in ``html`` that the crawl rules permit.

    :param html: text of a downloaded page.
    :return: unique ``ABSOLUTE_LINK_REGEX`` matches for which
        ``should_exclude`` is false.
    """
    permitted = set()
    for candidate in set(re.findall(ABSOLUTE_LINK_REGEX, html)):
        if should_exclude(candidate):
            continue
        permitted.add(candidate)

    return permitted


def find_relative_links(html: str, cur_link: str) -> set[str]:
    """Collect relative links of a page and resolve them against its host.

    :param html: text of the downloaded page.
    :param cur_link: URL the page was downloaded from.
    :return: ``host/path`` URLs built from ``href``/``src`` attribute values
        that start with a single slash, minus those rejected by
        ``should_exclude``. Empty when ``cur_link`` has no host matching
        ``DIRECT_LINK_REGEX`` or is itself excluded.
    """
    hosts = re.findall(DIRECT_LINK_REGEX, cur_link)
    # nothing to resolve against, or the whole page is excluded (optimization)
    if not hosts or should_exclude(cur_link):
        return set()

    host = hosts[0]
    candidates = (
        f'{host}/{path}'
        for attr in DOM_ATTRS
        for path in re.findall(f'{attr}="{RELATIVE_LINK_REGEX}', html)
    )

    return {candidate for candidate in candidates if not should_exclude(candidate)}


def cleanup_links(links: set[str]) -> set[str]:
    """Normalize raw links and drop the ones that must not be crawled.

    Each link is percent-decoded, HTML-unescaped and stripped of ``www.`` and
    of the ``http://`` / ``https://`` scheme. Links containing an anchor
    (``#``) and e-mail-like links (``@`` before the first dot) are discarded.
    The query string is removed.

    :param links: raw links as extracted from a page.
    :return: set of normalized links.
    """
    cleaned_links = set()
    for raw_link in links:
        # normalize link
        link = unescape(unquote(raw_link))
        link = link.replace('www.', '')
        link = link.replace('http://', '').replace('https://', '')

        # skip anchor links
        if '#' in link:
            continue

        # Remove GET params: keep only what precedes the FIRST '?'. Joining
        # every piece but the last would glue path and query fragments
        # together for links that contain more than one '?'.
        link = link.split('?', 1)[0]

        # skip mailto:
        if '@' in link.split('.')[0]:
            continue

        cleaned_links.add(link)

    return cleaned_links


async def crawl(url: str, session: aiohttp.ClientSession):
    """Request ``url``, record it when trackable and crawl the links it contains.

    A URL already present in ``VISITED_LINKS`` is ignored. Redirects are not
    followed. Responses with status 500, server disconnects and timeouts are
    retried by removing the URL from ``VISITED_LINKS`` and crawling it again.
    Any other status except 200 and 304 makes the URL be skipped.

    URLs with a textual, JavaScript, CSS or JSON content type are added to
    ``LINKS_TO_TRACK``; textual responses are additionally scanned for
    absolute and relative links, which are crawled concurrently.

    :param url: URL without scheme; ``PROTOCOL`` is prepended for the request.
    :param session: shared aiohttp client session.
    """
    if url in VISITED_LINKS:
        return
    VISITED_LINKS.add(url)

    try:
        logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
            # Default to '' so a response without a Content-Type header ends up
            # in the "unhandled type" branch instead of raising TypeError on
            # the substring checks below.
            content_type = response.headers.get('content-type', '')

            if response.status == 500:
                # un-mark the url, otherwise the retry would return immediately
                VISITED_LINKS.remove(url)
                return await asyncio.gather(crawl(url, session))

            if response.status not in {200, 304}:
                if response.status != 302:
                    content = await response.text(encoding='UTF-8')
                    logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
                return

            if 'text' in content_type:
                LINKS_TO_TRACK.add(url)

                html = await response.text(encoding='UTF-8')
                absolute_links = cleanup_links(find_absolute_links(html))
                relative_links = cleanup_links(find_relative_links(html, url))

                sub_links = absolute_links | relative_links
                await asyncio.gather(*[crawl(sub_link, session) for sub_link in sub_links])
            elif 'application/javascript' in content_type:
                LINKS_TO_TRACK.add(url)
            elif 'css' in content_type:
                LINKS_TO_TRACK.add(url)
            elif 'plain' in content_type:
                LINKS_TO_TRACK.add(url)
            elif 'application/json' in content_type:
                LINKS_TO_TRACK.add(url)
            else:
                # TODO track hashes of image/svg/video content types
                logger.info(f'Unhandled type: {content_type}')

            # A telegram url can work both with and without a trailing slash
            # (no redirect), though not on every sub domain. Which variant gets
            # added would otherwise depend on crawl order, so when both are
            # present the one without the trailing slash is preferred.
            without_trailing_slash = url[:-1] if url.endswith('/') else url
            if without_trailing_slash in LINKS_TO_TRACK and \
                    f'{without_trailing_slash}/' in LINKS_TO_TRACK:
                LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')
    except UnicodeDecodeError:
        logger.warning('Codec can\'t decode byte. So its was a tgs file')
    except ClientConnectorError:
        logger.warning(f'Wrong link: {url}')
    except (ServerDisconnectedError, TimeoutError):
        logger.warning(f'Retrying {url}')
        # un-mark the url, otherwise the retry would return immediately
        VISITED_LINKS.remove(url)
        await asyncio.gather(crawl(url, session))


async def start(url_list: set[str]):
    """Crawl every seed URL concurrently within one shared client session.

    :param url_list: seed URLs (without scheme) to start crawling from.
    """
    async with aiohttp.ClientSession(connector=CONNECTOR) as session:
        pending = [crawl(seed, session) for seed in url_list]
        await asyncio.gather(*pending)


if __name__ == '__main__':
    # The root domain is a crawl seed too; additional urls go straight into
    # the result without being crawled.
    HIDDEN_URLS.add(BASE_URL)
    LINKS_TO_TRACK = LINKS_TO_TRACK.union(ADDITIONAL_URLS)

    logger.info('Start crawling links...')
    start_time = time()
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(start(HIDDEN_URLS))
    elapsed = time() - start_time
    logger.info(f'Stop crawling links. {elapsed} sec.')

    # Best-effort diff against the previous list; a missing or unreadable
    # file just means there is nothing to compare with.
    try:
        with open(COMPARE_OUTPUT_WITH_FILENAME, 'r') as previous_file:
            OLD_URL_LIST = {line.replace('\n', '') for line in previous_file}

        is_equal = OLD_URL_LIST == LINKS_TO_TRACK
        deleted = OLD_URL_LIST - LINKS_TO_TRACK
        added = LINKS_TO_TRACK - OLD_URL_LIST
        logger.info(f'Is equal: {is_equal}')
        logger.info(f'Deleted: {deleted}')
        logger.info(f'Added: {added}')
    except IOError:
        pass

    # one url per line, sorted, no trailing newline
    output_text = '\n'.join(sorted(LINKS_TO_TRACK))
    with open(OUTPUT_FILENAME, 'w') as output_file:
        output_file.write(output_text)
Initial commit 2021-04-23 20:34:42 +02:00			`import asyncio`
			`import logging`
print lead time 2021-04-23 22:49:21 +02:00			`import os`
Initial commit 2021-04-23 20:34:42 +02:00			`import re`
handle timeout and retry 2021-04-24 11:38:33 +02:00			`from asyncio.exceptions import TimeoutError`
Initial commit 2021-04-23 20:34:42 +02:00			`from html import unescape`
print lead time 2021-04-23 22:49:21 +02:00			`from time import time`
Initial commit 2021-04-23 20:34:42 +02:00			`from urllib.parse import unquote`

			`import aiohttp`
add logging of skips 2021-06-21 16:09:49 +02:00			`from aiohttp import ClientConnectorError, ServerDisconnectedError`
Initial commit 2021-04-23 20:34:42 +02:00
			`PROTOCOL = 'https://'`
			`BASE_URL = 'telegram.org'`
add ability to set hidden urls 2021-04-23 22:31:23 +02:00			`# its necessary to help crawler to find more links`
			`HIDDEN_URLS = {`
return tracking of corefork 2021-09-09 20:43:53 +02:00			`'corefork.telegram.org',`
add getProxyConfig 2021-10-01 19:18:27 +02:00			`'corefork.telegram.org/getProxyConfig',`
add ability to set hidden urls 2021-04-23 22:31:23 +02:00
			`'telegram.org/privacy/gmailbot',`
			`'telegram.org/tos',`
rework exclude system; add exclude by count of slashes; add translations.telegram.org 2021-04-24 00:06:50 +02:00			`'telegram.org/tour',`
move tracked links list to main branch; add generation of content tree; add workflow for tree generation; add new tg hidden link. 2021-04-24 14:19:01 +02:00			`'telegram.org/evolution',`
rework exclude system; add exclude by count of slashes; add translations.telegram.org 2021-04-24 00:06:50 +02:00
rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`'desktop.telegram.org/changelog',`
exclude dynamic list of users 2021-05-05 12:58:54 +02:00
add 304 as success response; add handling of xml; add sparkle file of macOS client; fix request retrying with 500 status code; fix encoding. 2021-12-30 14:22:01 +01:00			`'osx.telegram.org/updates/versions.xml',`

exclude dynamic list of users 2021-05-05 12:58:54 +02:00			`'instantview.telegram.org/rules',`
add CIDR 2021-05-24 13:52:40 +02:00
			`'core.telegram.org/resources/cidr.txt',`
add apple privacy page 2021-05-28 20:43:45 +02:00			`'core.telegram.org/apple_privacy',`
add getProxyConfig 2021-10-01 19:18:27 +02:00			`'core.telegram.org/getProxyConfig',`
fix tracking of ad panel 2021-10-26 17:55:08 +02:00
			`'promote.telegram.org',`
			`'contest.com',`
add ability to set hidden urls 2021-04-23 22:31:23 +02:00			`}`
add additional urls section; add tracking of mtproto schema. 2021-06-01 08:57:10 +02:00			`ADDITIONAL_URLS = {`
add mtproto tl 2021-11-04 09:24:24 +01:00			`'raw.githubusercontent.com/telegramdesktop/tdesktop/dev/Telegram/Resources/tl/mtproto.tl',`
add contest.com 2021-06-01 09:03:35 +02:00			`'raw.githubusercontent.com/telegramdesktop/tdesktop/dev/Telegram/Resources/tl/api.tl',`
Added TDLib schema link 2021-08-31 14:18:25 +02:00			`'raw.githubusercontent.com/tdlib/td/master/td/generate/scheme/telegram_api.tl',`
add td_api of tdlib 2021-11-11 10:17:53 +01:00			`'raw.githubusercontent.com/tdlib/td/master/td/generate/scheme/td_api.tl',`
add additional urls section; add tracking of mtproto schema. 2021-06-01 08:57:10 +02:00			`}`
Initial commit 2021-04-23 20:34:42 +02:00			`BASE_URL_REGEX = r'telegram.org'`

rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`# disable crawling sub links for specific domains and url patterns`
			`CRAWL_RULES = {`
			`# every rule is regex`
			`# empty string means match any url`
cleanup readme file; disable tests of link crawler. 2021-04-25 14:13:44 +02:00			`# allow rules with higher priority than deny`
Initial commit 2021-04-23 20:34:42 +02:00			`'translations.telegram.org': {`
rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`'allow': {`
			`r'^[^/]*$', # root`
			`r'org/[^/]*/$', # 1 lvl sub`
			`r'/en/[a-z_]+/$' # 1 lvl after /en/`
			`},`
			`'deny': {`
			`'', # all`
rework exclude system; add exclude by count of slashes; add translations.telegram.org 2021-04-24 00:06:50 +02:00			`}`
Initial commit 2021-04-23 20:34:42 +02:00			`},`
rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`'bugs.telegram.org': { # crawl first page of cards sorted by rating`
			`'deny': {`
disable crawling of bugs.telegram.org 2021-04-24 14:43:01 +02:00			`# r'/c/[0-9]+/[0-9]+', # disable comments`
			`'',`
rework exclude system; add exclude by count of slashes; add translations.telegram.org 2021-04-24 00:06:50 +02:00			`},`
Initial commit 2021-04-23 20:34:42 +02:00			`},`
			`'instantview.telegram.org': {`
rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`'deny': {`
rework exclude system; add exclude by count of slashes; add translations.telegram.org 2021-04-24 00:06:50 +02:00			`'file/',`
Initial commit 2021-04-23 20:34:42 +02:00
rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`r'templates/.+',`
rework exclude system; add exclude by count of slashes; add translations.telegram.org 2021-04-24 00:06:50 +02:00			`'samples/',`
exclude dynamic list of users 2021-05-05 12:58:54 +02:00			`'contest',`
rework exclude system; add exclude by count of slashes; add translations.telegram.org 2021-04-24 00:06:50 +02:00			`},`
Initial commit 2021-04-23 20:34:42 +02:00			`},`
			`'core.telegram.org': {`
rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`'deny': {`
rework exclude system; add exclude by count of slashes; add translations.telegram.org 2021-04-24 00:06:50 +02:00			`'file/',`
Initial commit 2021-04-23 20:34:42 +02:00
exclude frequently updated pages 2021-05-01 15:10:20 +02:00			`'bots/payments',`

rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`'tdlib/docs/classtd',`
temp exclude strange dynamic page 2021-05-01 16:28:02 +02:00
			`'validatedRequestedInfo',`
rework exclude system; add exclude by count of slashes; add translations.telegram.org 2021-04-24 00:06:50 +02:00			`},`
Initial commit 2021-04-23 20:34:42 +02:00			`},`
return tracking of corefork 2021-09-09 20:43:53 +02:00			`'corefork.telegram.org': {`
			`'deny': {`
			`'file/',`

			`'bots/payments',`

			`'tdlib/docs/classtd',`

			`'validatedRequestedInfo',`
			`},`
			`},`
Initial commit 2021-04-23 20:34:42 +02:00			`'telegram.org': {`
rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`'deny': {`
rework exclude system; add exclude by count of slashes; add translations.telegram.org 2021-04-24 00:06:50 +02:00			`'file/',`
Revert "test alert system" This reverts commit fad0585d 2021-04-25 18:59:03 +02:00			`r'apps$'`
rework exclude system; add exclude by count of slashes; add translations.telegram.org 2021-04-24 00:06:50 +02:00			`},`
exclude frequently updated pages 2021-05-01 15:10:20 +02:00			`},`
			`'webz.telegram.org': {`
			`'deny': {`
			`'',`
			`},`
			`},`
			`'webk.telegram.org': {`
			`'deny': {`
			`'',`
			`},`
			`},`
Initial commit 2021-04-23 20:34:42 +02:00			`}`

			`DIRECT_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,249}' + BASE_URL_REGEX + r')'`
fix, improve and cleanup links crawler; add diff or files 2021-04-24 22:40:25 +02:00			`ABSOLUTE_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,248}' + BASE_URL_REGEX + r'\b[-a-zA-Z0-9@:%_\+.~#?&//=]*)'`
			`RELATIVE_LINK_REGEX = r'\/(?!\/)([-a-zA-Z0-9\/@:%._\+~#]{0,249})'`
Initial commit 2021-04-23 20:34:42 +02:00
			`DOM_ATTRS = ['href', 'src']`

the end. 2021-04-23 21:47:15 +02:00			`OUTPUT_FILENAME = os.environ.get('OUTPUT_FILENAME', 'tracked_links.txt')`
add comparing output to github ci logs 2021-06-27 23:06:29 +02:00			`COMPARE_OUTPUT_WITH_FILENAME = os.environ.get('COMPARE_OUTPUT_WITH_FILENAME', OUTPUT_FILENAME)`
Initial commit 2021-04-23 20:34:42 +02:00
			`# unsecure but so simple`
			`CONNECTOR = aiohttp.TCPConnector(ssl=False)`
rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`TIMEOUT = aiohttp.ClientTimeout(total=30)`
Initial commit 2021-04-23 20:34:42 +02:00
			`logging.basicConfig(format='%(message)s', level=logging.DEBUG)`
			`logger = logging.getLogger(__name__)`

			`VISITED_LINKS = set()`
			`LINKS_TO_TRACK = set()`


rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`def should_exclude(url: str) -> bool:`
			`direct_link = re.findall(DIRECT_LINK_REGEX, url)[0]`
			`domain_rules = CRAWL_RULES.get(direct_link)`
			`if not domain_rules:`
			`return False`
Initial commit 2021-04-23 20:34:42 +02:00
rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`allow_rules = domain_rules.get('allow', set())`
			`deny_rules = domain_rules.get('deny', set())`
rework exclude system; add exclude by count of slashes; add translations.telegram.org 2021-04-24 00:06:50 +02:00
rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`exclude = False`
rework exclude system; add exclude by count of slashes; add translations.telegram.org 2021-04-24 00:06:50 +02:00
rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`for regex in deny_rules:`
			`if re.search(regex, url):`
			`exclude = True`
			`break`
Initial commit 2021-04-23 20:34:42 +02:00
rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`for regex in allow_rules:`
			`if re.search(regex, url):`
			`exclude = False`
			`break`
Initial commit 2021-04-23 20:34:42 +02:00
rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`return exclude`
Initial commit 2021-04-23 20:34:42 +02:00

rework exclude system; add exclude by count of slashes; add translations.telegram.org 2021-04-24 00:06:50 +02:00			`def find_absolute_links(html: str) -> set[str]:`
			`absolute_links = set(re.findall(ABSOLUTE_LINK_REGEX, html))`

			`return {link for link in absolute_links if not should_exclude(link)}`
Initial commit 2021-04-23 20:34:42 +02:00

			`def find_relative_links(html: str, cur_link: str) -> set[str]:`
fix tracking of ad panel 2021-10-26 17:55:08 +02:00			`matches = re.findall(DIRECT_LINK_REGEX, cur_link)`
			`if not matches:`
			`return set()`

Initial commit 2021-04-23 20:34:42 +02:00			`direct_cur_link = re.findall(DIRECT_LINK_REGEX, cur_link)[0]`
			`# optimization. when we want to exclude domain`
rework exclude system; add exclude by count of slashes; add translations.telegram.org 2021-04-24 00:06:50 +02:00			`if should_exclude(cur_link):`
Initial commit 2021-04-23 20:34:42 +02:00			`return set()`

			`relative_links = set()`
			`for attr in DOM_ATTRS:`
			`regex = f'{attr}="{RELATIVE_LINK_REGEX}'`
			`links = re.findall(regex, html)`

			`for link in links:`
fix, improve and cleanup links crawler; add diff or files 2021-04-24 22:40:25 +02:00			`url = f'{direct_cur_link}/{link}'`
			`if not should_exclude(url):`
			`relative_links.add(url)`
Initial commit 2021-04-23 20:34:42 +02:00
			`return relative_links`


			`def cleanup_links(links: set[str]) -> set[str]:`
			`cleaned_links = set()`
			`for tmp_link in links:`
			`# normalize link`
			`link = unquote(tmp_link)`
			`link = unescape(link)`
			`link = link.replace('www.', '')`
			`link = link.replace('http://', '').replace('https://', '')`
fix, improve and cleanup links crawler; add diff or files 2021-04-24 22:40:25 +02:00
Initial commit 2021-04-23 20:34:42 +02:00			`# skip anchor links`
			`if '#' in link:`
			`continue`

fix, improve and cleanup links crawler; add diff or files 2021-04-24 22:40:25 +02:00			`# remove get params from link`
			`if '?' in link:`
			`link = ''.join(link.split('?')[:-1])`

			`# skip mailto:`
Initial commit 2021-04-23 20:34:42 +02:00			`link_parts = link.split('.')`
			`if '@' in link_parts[0]:`
			`continue`

			`cleaned_links.add(link)`

			`return cleaned_links`


fix performance and memory issues 2021-04-23 22:38:54 +02:00			`async def crawl(url: str, session: aiohttp.ClientSession):`
fix, improve and cleanup links crawler; add diff or files 2021-04-24 22:40:25 +02:00			`if url in VISITED_LINKS:`
Initial commit 2021-04-23 20:34:42 +02:00			`return`
fix, improve and cleanup links crawler; add diff or files 2021-04-24 22:40:25 +02:00			`VISITED_LINKS.add(url)`
Initial commit 2021-04-23 20:34:42 +02:00
			`try:`
			`logger.info(f'[{len(VISITED_LINKS)}] Process {url}')`
fix resending of requests; fix timeout settings; fix handling of invalid urls; edit schedule of generation list to test. 2021-04-25 08:17:58 +02:00			`async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:`
Initial commit 2021-04-23 20:34:42 +02:00			`content_type = response.headers.get('content-type')`

resend requests with server error response 2021-06-21 18:39:20 +02:00			`if response.status == 500:`
add 304 as success response; add handling of xml; add sparkle file of macOS client; fix request retrying with 500 status code; fix encoding. 2021-12-30 14:22:01 +01:00			`VISITED_LINKS.remove(url)`
resend requests with server error response 2021-06-21 18:39:20 +02:00			`return await asyncio.gather(crawl(url, session))`
add logging of skips 2021-06-21 16:09:49 +02:00
add 304 as success response; add handling of xml; add sparkle file of macOS client; fix request retrying with 500 status code; fix encoding. 2021-12-30 14:22:01 +01:00			`if response.status not in {200, 304}:`
resend requests with server error response 2021-06-21 18:39:20 +02:00			`if response.status != 302:`
add 304 as success response; add handling of xml; add sparkle file of macOS client; fix request retrying with 500 status code; fix encoding. 2021-12-30 14:22:01 +01:00			`content = await response.text(encoding='UTF-8')`
resend requests with server error response 2021-06-21 18:39:20 +02:00			`logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')`
Initial commit 2021-04-23 20:34:42 +02:00			`return`

add 304 as success response; add handling of xml; add sparkle file of macOS client; fix request retrying with 500 status code; fix encoding. 2021-12-30 14:22:01 +01:00			`if 'text' in content_type:`
Initial commit 2021-04-23 20:34:42 +02:00			`LINKS_TO_TRACK.add(url)`

add 304 as success response; add handling of xml; add sparkle file of macOS client; fix request retrying with 500 status code; fix encoding. 2021-12-30 14:22:01 +01:00			`html = await response.text(encoding='UTF-8')`
Initial commit 2021-04-23 20:34:42 +02:00			`absolute_links = cleanup_links(find_absolute_links(html))`
			`relative_links = cleanup_links(find_relative_links(html, url))`

fix set operation 2021-04-23 20:44:17 +02:00			`sub_links = absolute_links \| relative_links`
fix performance and memory issues 2021-04-23 22:38:54 +02:00			`await asyncio.gather(*[crawl(url, session) for url in sub_links])`
Initial commit 2021-04-23 20:34:42 +02:00			`elif 'application/javascript' in content_type:`
			`LINKS_TO_TRACK.add(url)`
fix bug with tracking of plain text urls 2021-10-01 19:47:35 +02:00			`elif 'css' in content_type:`
Initial commit 2021-04-23 20:34:42 +02:00			`LINKS_TO_TRACK.add(url)`
fix bug with tracking of plain text urls 2021-10-01 19:47:35 +02:00			`elif 'plain' in content_type:`
add CIDR 2021-05-24 13:52:40 +02:00			`LINKS_TO_TRACK.add(url)`
Initial commit 2021-04-23 20:34:42 +02:00			`elif 'application/json' in content_type:`
			`LINKS_TO_TRACK.add(url)`
			`else:`
			`# TODO track hashes of image/svg/video content types`
			`logger.info(f'Unhandled type: {content_type}')`
fix, improve and cleanup links crawler; add diff or files 2021-04-24 22:40:25 +02:00
			`# telegram url can work with and without trailing slash (no redirect). P.S. not on every sub domain ;d`
			`# so this is a problem when we have random behavior with link will be added`
			`# this if resolve this issue. If available both link we prefer without trailing slash`
			`without_trailing_slash = url[:-1:] if url.endswith('/') else url`
			`if without_trailing_slash in LINKS_TO_TRACK and \`
			`f'{without_trailing_slash}/' in LINKS_TO_TRACK:`
			`LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')`
rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`except UnicodeDecodeError:`
			`logger.warning('Codec can\'t decode byte. So its was a tgs file')`
fix resending of requests; fix timeout settings; fix handling of invalid urls; edit schedule of generation list to test. 2021-04-25 08:17:58 +02:00			`except ClientConnectorError:`
			`logger.warning(f'Wrong link: {url}')`
add logging of skips 2021-06-21 16:09:49 +02:00			`except (ServerDisconnectedError, TimeoutError):`
fix, improve and cleanup links crawler; add diff or files 2021-04-24 22:40:25 +02:00			`logger.warning(f'Retrying {url}')`
fix resending of requests; fix timeout settings; fix handling of invalid urls; edit schedule of generation list to test. 2021-04-25 08:17:58 +02:00			`VISITED_LINKS.remove(url)`
rewrite logic of rules system (not its uses regex); fix trailing slash; many attempt to send request; 2021-04-24 11:29:19 +02:00			`await asyncio.gather(crawl(url, session))`
Initial commit 2021-04-23 20:34:42 +02:00

add ability to set hidden urls 2021-04-23 22:31:23 +02:00			`async def start(url_list: set[str]):`
fix performance and memory issues 2021-04-23 22:38:54 +02:00			`async with aiohttp.ClientSession(connector=CONNECTOR) as session:`
			`await asyncio.gather(*[crawl(url, session) for url in url_list])`
add ability to set hidden urls 2021-04-23 22:31:23 +02:00

Initial commit 2021-04-23 20:34:42 +02:00			`if __name__ == '__main__':`
add ability to set hidden urls 2021-04-23 22:31:23 +02:00			`HIDDEN_URLS.add(BASE_URL)`
add additional urls section; add tracking of mtproto schema. 2021-06-01 08:57:10 +02:00			`LINKS_TO_TRACK = LINKS_TO_TRACK \| ADDITIONAL_URLS`
add ability to set hidden urls 2021-04-23 22:31:23 +02:00
move tracked links list to main branch; add generation of content tree; add workflow for tree generation; add new tg hidden link. 2021-04-24 14:19:01 +02:00			`logger.info('Start crawling links...')`
print lead time 2021-04-23 22:49:21 +02:00			`start_time = time()`
add ability to set hidden urls 2021-04-23 22:31:23 +02:00			`asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))`
move tracked links list to main branch; add generation of content tree; add workflow for tree generation; add new tg hidden link. 2021-04-24 14:19:01 +02:00			`logger.info(f'Stop crawling links. {time() - start_time} sec.')`
Initial commit 2021-04-23 20:34:42 +02:00
fix, improve and cleanup links crawler; add diff or files 2021-04-24 22:40:25 +02:00			`try:`
add comparing output to github ci logs 2021-06-27 23:06:29 +02:00			`with open(COMPARE_OUTPUT_WITH_FILENAME, 'r') as f:`
fix, improve and cleanup links crawler; add diff or files 2021-04-24 22:40:25 +02:00			`OLD_URL_LIST = set([l.replace('\n', '') for l in f.readlines()])`

			`logger.info(f'Is equal: {OLD_URL_LIST == LINKS_TO_TRACK}')`
			`logger.info(f'Deleted: {OLD_URL_LIST - LINKS_TO_TRACK}')`
			`logger.info(f'Added: {LINKS_TO_TRACK - OLD_URL_LIST}')`
			`except IOError:`
			`pass`

Initial commit 2021-04-23 20:34:42 +02:00			`with open(OUTPUT_FILENAME, 'w') as f:`
			`f.write('\n'.join(sorted(LINKS_TO_TRACK)))`