telegram-crawler/make_tracked_links_list.py

import asyncio
import logging
import os
import re
from asyncio.exceptions import TimeoutError
from html import unescape
from time import time
from urllib.parse import unquote

import aiohttp
from aiohttp import ClientConnectorError, ServerDisconnectedError

PROTOCOL = 'https://'
BASE_URL = 'telegram.org'
# necessary to help the crawler find more links
HIDDEN_URLS = {
    'corefork.telegram.org',
    'corefork.telegram.org/getProxyConfig',

    'telegram.org/privacy/gmailbot',
    'telegram.org/tos',
    'telegram.org/tour',
    'telegram.org/evolution',
    'desktop.telegram.org/changelog',
    'td.telegram.org/current',
    'td.telegram.org/current2',

    'osx.telegram.org/updates/versions.xml',  # stable
    'api.appcenter.ms/v0.1/public/sparkle/apps/6ed2ac30-49e1-4073-87c2-f1ffcb74e81f',  # beta

    'instantview.telegram.org/rules',

    'core.telegram.org/resources/cidr.txt',
    'core.telegram.org/apple_privacy',
    'core.telegram.org/getProxyConfig',
    'core.telegram.org/video_stickers',
    'core.telegram.org/stickers',

    'promote.telegram.org',
    'contest.com',

    # web apps beta
    'comments.app/test_webview',  # old
    'webappcontent.telegram.org/demo',  # new
    'webappcontent.telegram.org/cafe',  # demo 2
    # 'a-webappcontent.stel.com/demo',
    # 'a-webappcontent.stel.com/cafe',
}
ADDITIONAL_URLS = {
    'raw.githubusercontent.com/telegramdesktop/tdesktop/dev/Telegram/Resources/tl/mtproto.tl',
    'raw.githubusercontent.com/telegramdesktop/tdesktop/dev/Telegram/Resources/tl/api.tl',
    'raw.githubusercontent.com/tdlib/td/master/td/generate/scheme/telegram_api.tl',
    'raw.githubusercontent.com/tdlib/td/master/td/generate/scheme/td_api.tl',
}
BASE_URL_REGEX = r'telegram.org'

# disable crawling of sub links for specific domains and url patterns
CRAWL_RULES = {
    # every rule is a regex
    # an empty string matches any url
    # allow rules take priority over deny rules
    'translations.telegram.org': {
        'allow': {
            r'^[^/]*$',  # root
            r'org/[^/]*/$',  # 1st lvl sub
            r'/css/[a-z-_.]+$',  # css files
            r'/en/[a-z_]+/$',  # 1st lvl after /en/
            r'/en/[a-z_]+/[a-z_]+/$',  # 2nd lvl after /en/, for example /en/ios/unsorted/
        },
        'deny': {
            '',  # all
        }
    },
    'osx.telegram.org': {
        'deny': {
            'updates/Telegram',
        }
    },
    'bugs.telegram.org': {  # crawl first page of cards sorted by rating
        'deny': {
            # r'/c/[0-9]+/[0-9]+',  # disable comments
            '',
        },
    },
    'instantview.telegram.org': {
        'deny': {
            r'templates/.+',
            'samples/',
            'contest',
        },
    },
    'core.telegram.org': {
        'deny': {
            'bots/payments',
            'tdlib/docs/classtd',
            'validatedRequestedInfo',
        },
    },
    'corefork.telegram.org': {
        'deny': {
            'bots/payments',
            'tdlib/docs/classtd',
            'validatedRequestedInfo',
        },
    },
    'telegram.org': {
        'deny': {
            r'apps$',
            r'img/StickerExample.psd$',
        },
    },
    'webz.telegram.org': {
        'deny': {
            '',
        },
    },
    'webk.telegram.org': {
        'deny': {
            '',
        },
    },
}
DIRECT_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,249}' + BASE_URL_REGEX + r')'
ABSOLUTE_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,248}' + BASE_URL_REGEX + r'\b[-a-zA-Z0-9@:%_\+.~#?&//=]*)'
RELATIVE_LINK_REGEX = r'\/(?!\/)([-a-zA-Z0-9\/@:%._\+~#]{0,249})'
RELATIVE_JS_SCRIPTS_REGEX = r'["\'](.*\.js)["\'\?]'
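
# Rough intent of the patterns above (illustrative, not exhaustive):
#   DIRECT_LINK_REGEX    captures just the domain part, e.g. 'core.telegram.org'
#                        (its character class has no '/', so the match stops there)
#   ABSOLUTE_LINK_REGEX  captures a full url including path and query characters
#   RELATIVE_LINK_REGEX  captures the path after a leading '/', while the (?!\/)
#                        lookahead rejects protocol-relative '//host' links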
DOM_ATTRS = ['href', 'src']
OUTPUT_FILENAME = os.environ.get('OUTPUT_FILENAME', 'tracked_links.txt')
COMPARE_OUTPUT_WITH_FILENAME = os.environ.get('COMPARE_OUTPUT_WITH_FILENAME', OUTPUT_FILENAME)
# insecure but simple
CONNECTOR = aiohttp.TCPConnector(ssl=False)
TIMEOUT = aiohttp.ClientTimeout(total=10)
logging.basicConfig(format='%(message)s', level=logging.DEBUG)
logger = logging.getLogger(__name__)
VISITED_LINKS = set()
LINKS_TO_TRACK = set()
def should_exclude(url: str) -> bool:
    direct_link = re.findall(DIRECT_LINK_REGEX, url)[0]
    domain_rules = CRAWL_RULES.get(direct_link)
    if not domain_rules:
        return False

    allow_rules = domain_rules.get('allow', set())
    deny_rules = domain_rules.get('deny', set())

    exclude = False

    for regex in deny_rules:
        if re.search(regex, url):
            exclude = True
            break

    for regex in allow_rules:
        if re.search(regex, url):
            exclude = False
            break

    return exclude
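
# Rule-precedence illustration for should_exclude (hypothetical calls, not executed):
# the '' catch-all deny for translations.telegram.org matches every url, but a
# matching allow rule wins.
#
#   should_exclude('translations.telegram.org/ru/android/')  # True: denied by ''
#   should_exclude('translations.telegram.org/en/android/')  # False: allowed by r'/en/[a-z_]+/$'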
def find_absolute_links(html: str) -> set[str]:
    absolute_links = set(re.findall(ABSOLUTE_LINK_REGEX, html))
    return {link for link in absolute_links if not should_exclude(link)}
def find_relative_links(html: str, cur_link: str) -> set[str]:
    matches = re.findall(DIRECT_LINK_REGEX, cur_link)
    if not matches:
        return set()
    direct_cur_link = matches[0]

    # optimization: bail out early when the whole domain is excluded
    if should_exclude(cur_link):
        return set()

    relative_links = set()
    for attr in DOM_ATTRS:
        regex = f'{attr}="{RELATIVE_LINK_REGEX}'
        links = re.findall(regex, html)

        for link in links:
            url = f'{direct_cur_link}/{link}'
            if not should_exclude(url):
                relative_links.add(url)

    return relative_links
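
# Example (hypothetical html snippet): with cur_link = 'core.telegram.org/api',
# the href pattern matches href="/bots" and yields 'core.telegram.org/bots';
# protocol-relative href="//host/path" links are rejected by the (?!\/) lookahead.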
def find_relative_scripts(code: str, cur_link: str) -> set[str]:
    matches = re.findall(DIRECT_LINK_REGEX, cur_link)
    if not matches:
        return set()
    direct_cur_link = matches[0]

    relative_links = set()
    for link in re.findall(RELATIVE_JS_SCRIPTS_REGEX, code):
        # dirty magic for specific cases
        if '/' in link:
            # path to the file from the site root
            url = f'{direct_cur_link}/{link}'
        else:
            # relative link from the current folder, not from the root
            current_folder_link, *_ = cur_link.rsplit('/', 1)
            url = f'{current_folder_link}/{link}'

        if not should_exclude(url):
            relative_links.add(url)

    return relative_links
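
# Resolution sketch for the two branches above (hypothetical inputs):
#
#   find_relative_scripts('src="js/app.js"', 'telegram.org/some/page')
#   # 'js/app.js' contains '/', so it resolves from the root: telegram.org/js/app.js
#
#   find_relative_scripts('src="app.js"', 'telegram.org/some/page')
#   # no '/', so it resolves from the current folder: telegram.org/some/app.js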
def cleanup_links(links: set[str]) -> set[str]:
    cleaned_links = set()
    for tmp_link in links:
        # normalize the link
        link = unquote(tmp_link)
        link = unescape(link)
        link = link.replace('www.', '')
        link = link.replace('http://', '').replace('https://', '')
        link = link.replace('//', '/')  # not a universal solution
        link = link.replace('"', '')  # regex fix hack

        # skip anchor links
        if '#' in link:
            continue

        # drop get params from the link
        if '?' in link:
            link = link.split('?')[0]

        # skip mailto:
        link_parts = link.split('.')
        if '@' in link_parts[0]:
            continue

        cleaned_links.add(link)

    return cleaned_links
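
# Normalization sketch (hypothetical input): percent-encoding and HTML entities
# are decoded, the scheme and 'www.' are stripped, and query strings are dropped:
#
#   cleanup_links({'https://www.telegram.org/faq?setln=en'})
#   # -> {'telegram.org/faq'}
#
# while anchor links such as 'telegram.org/faq#general' are skipped entirely.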
def is_trackable_content_type(content_type) -> bool:
    trackable_content_types = (
        'css',
        'plain',
        'json',
        'svg',
        'png',
        'jpeg',
        'x-icon',
        'gif',
        'mp4',
        'webm',
        'application/octet-stream',  # td updates
        'application/zip',
    )

    for trackable_content_type in trackable_content_types:
        if trackable_content_type in content_type:
            return True

    return False
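
# Substring matching keeps this check tolerant of charset suffixes: with
# hypothetical header values, 'text/css' and 'application/json; charset=utf-8'
# both match, while 'text/html' falls through (the caller handles 'text' first).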
async def crawl(url: str, session: aiohttp.ClientSession):
    if url in VISITED_LINKS:
        return
    VISITED_LINKS.add(url)

    try:
        logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
            content_type = response.headers.get('content-type')

            if response.status // 100 == 5:
                VISITED_LINKS.remove(url)
                logger.warning(f'Error 5XX. Retrying {url}')
                return await crawl(url, session)

            if response.status not in {200, 304}:
                if response.status != 302:
                    content = await response.text(encoding='UTF-8')
                    logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
                return

            if 'text' in content_type or 'javascript' in content_type:
                LINKS_TO_TRACK.add(url)

                content = await response.text(encoding='UTF-8')
                absolute_links = cleanup_links(find_absolute_links(content))

                relative_links_finder = find_relative_links
                if 'javascript' in content_type:
                    relative_links_finder = find_relative_scripts
                relative_links = cleanup_links(relative_links_finder(content, url))

                sub_links = absolute_links | relative_links
                await asyncio.gather(*[crawl(url, session) for url in sub_links])
            elif is_trackable_content_type(content_type):
                LINKS_TO_TRACK.add(url)
            else:
                # for example, a zip with an update of the macOS client
                logger.info(f'Unhandled type: {content_type} from {url}')

            # telegram urls can work both with and without a trailing slash (no redirect),
            # though not on every subdomain, so which variant gets tracked is otherwise
            # random. if both variants were collected, keep only the one without the slash.
            without_trailing_slash = url[:-1] if url.endswith('/') else url
            if without_trailing_slash in LINKS_TO_TRACK and \
                    f'{without_trailing_slash}/' in LINKS_TO_TRACK:
                LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')
    except UnicodeDecodeError:
        logger.warning(f'Codec can\'t decode bytes. Probably a tgs file or a response with a broken content type: {url}')
    # except ClientConnectorError:
    #     logger.warning(f'Wrong link: {url}')
    except (ServerDisconnectedError, TimeoutError, ClientConnectorError):
        logger.warning(f'Client or timeout error. Retrying {url}')
        VISITED_LINKS.remove(url)
        # sleep + count of attempts?
        await crawl(url, session)
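
# Design note (an observation, not from the original comments): retries work by
# removing the url from VISITED_LINKS and recursively calling crawl() again;
# there is no backoff or attempt cap, so a persistently failing url keeps
# retrying until the request finally succeeds or the process is stopped.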
async def start(url_list: set[str]):
    async with aiohttp.ClientSession(connector=CONNECTOR) as session:
        await asyncio.gather(*[crawl(url, session) for url in url_list])
if __name__ == '__main__':
    HIDDEN_URLS.add(BASE_URL)
    LINKS_TO_TRACK = LINKS_TO_TRACK | ADDITIONAL_URLS

    logger.info('Start crawling links...')
    start_time = time()
    asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))
    logger.info(f'Stop crawling links. {time() - start_time} sec.')

    try:
        with open(COMPARE_OUTPUT_WITH_FILENAME, 'r') as f:
            OLD_URL_LIST = {line.rstrip('\n') for line in f}

        logger.info(f'Is equal: {OLD_URL_LIST == LINKS_TO_TRACK}')
        logger.info(f'Deleted: {OLD_URL_LIST - LINKS_TO_TRACK}')
        logger.info(f'Added: {LINKS_TO_TRACK - OLD_URL_LIST}')
    except IOError:
        pass

    with open(OUTPUT_FILENAME, 'w') as f:
        f.write('\n'.join(sorted(LINKS_TO_TRACK)))
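
# Usage sketch (assumed invocation; both env vars are optional and default as above):
#   OUTPUT_FILENAME=tracked_links.txt python make_tracked_links_list.py
# The script writes the sorted link list to OUTPUT_FILENAME and, if the file named
# by COMPARE_OUTPUT_WITH_FILENAME exists, logs the diff against the previous run.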