telegram-crawler/make_tracked_links_list.py
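"""Build the list of telegram.org pages to track.

The crawler starts from BASE_URL and HIDDEN_URLS, follows links according to
CRAWL_RULES, and writes every tracked link to OUTPUT_FILENAME, sorted.

Hypothetical usage (the env var and default file name come from the code below):

    OUTPUT_FILENAME=tracked_links.txt python make_tracked_links_list.py
"""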

import asyncio
import logging
import os
import re
from asyncio.exceptions import TimeoutError
from html import unescape
from time import time
from urllib.parse import unquote

import aiohttp
from aiohttp import ClientConnectorError

PROTOCOL = 'https://'
BASE_URL = 'telegram.org'
# it's necessary to help the crawler find more links
HIDDEN_URLS = {
    # 'corefork.telegram.org',
    'telegram.org/privacy/gmailbot',
    'telegram.org/tos',
    'telegram.org/tour',
    'telegram.org/evolution',
    'desktop.telegram.org/changelog',
}

BASE_URL_REGEX = r'telegram.org'

# disable crawling sub-links for specific domains and url patterns
CRAWL_RULES = {
    # every rule is a regex
    # an empty string matches any url
    # allow rules have higher priority than deny rules
    'translations.telegram.org': {
        'allow': {
            r'^[^/]*$',  # root
            r'org/[^/]*/$',  # 1 lvl sub
            r'/en/[a-z_]+/$',  # 1 lvl after /en/
        },
        'deny': {
            '',  # all
        },
    },
    'bugs.telegram.org': {  # crawl first page of cards sorted by rating
        'deny': {
            # r'/c/[0-9]+/[0-9]+',  # disable comments
            '',
        },
    },
    'instantview.telegram.org': {
        'allow': {
            'contest/winners',
        },
        'deny': {
            'file/',
            r'templates/.+',
            'samples/',
            'contest/',
        },
    },
    'core.telegram.org': {
        'deny': {
            'file/',
            'tdlib/docs/classtd',
            'constructor/',
            'method/',
            'type/',
        },
    },
    'telegram.org': {
        'deny': {
            'file/',
        },
    },
}
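# Example of how should_exclude() below applies these rules:
#   any url under 'core.telegram.org/constructor/' hits that domain's 'constructor/'
#   deny rule and is excluded, while 'translations.telegram.org/en/android/' hits the
#   domain-wide deny rule ('') but is kept, because the allow rule r'/en/[a-z_]+/$'
#   matches and allow rules override deny rules.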
DIRECT_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,249}' + BASE_URL_REGEX + r')'
ABSOLUTE_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{1,249}' + BASE_URL_REGEX + r'\b[-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)'
RELATIVE_LINK_REGEX = r'\/([-a-zA-Z0-9\/@:%._\+~#]{0,249})'
DOM_ATTRS = ['href', 'src']

OUTPUT_FILENAME = os.environ.get('OUTPUT_FILENAME', 'tracked_links.txt')

# insecure, but simple: don't verify TLS certificates
CONNECTOR = aiohttp.TCPConnector(ssl=False)
TIMEOUT = aiohttp.ClientTimeout(total=30)

logging.basicConfig(format='%(message)s', level=logging.DEBUG)
logger = logging.getLogger(__name__)

VISITED_LINKS = set()
LINKS_TO_TRACK = set()

def should_exclude(url: str) -> bool:
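    """Return True when the url matches a deny rule of its domain in CRAWL_RULES
    and no allow rule overrides it; urls of domains without rules are never excluded."""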
    direct_link = re.findall(DIRECT_LINK_REGEX, url)[0]
    domain_rules = CRAWL_RULES.get(direct_link)
    if not domain_rules:
        return False

    allow_rules = domain_rules.get('allow', set())
    deny_rules = domain_rules.get('deny', set())

    exclude = False

    for regex in deny_rules:
        if re.search(regex, url):
            exclude = True
            break

    for regex in allow_rules:
        if re.search(regex, url):
            exclude = False
            break

    return exclude

def find_absolute_links(html: str) -> set[str]:
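    """Collect absolute telegram.org links from raw html, dropping excluded ones."""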
    absolute_links = set(re.findall(ABSOLUTE_LINK_REGEX, html))

    return {link for link in absolute_links if not should_exclude(link)}

def find_relative_links(html: str, cur_link: str) -> set[str]:
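    """Collect links from href/src attributes of the page at cur_link and return
    them as '<domain>/<path>' strings, dropping excluded ones."""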
    direct_cur_link = re.findall(DIRECT_LINK_REGEX, cur_link)[0]

    # optimization: skip the whole page if its domain is excluded
    if should_exclude(cur_link):
        return set()

    relative_links = set()
    for attr in DOM_ATTRS:
        regex = f'{attr}="{RELATIVE_LINK_REGEX}'
        links = re.findall(regex, html)

        for link in links:
            # skip protocol-relative links such as //www.apple.com
            if link.startswith('/'):
                # absolute links starting with a double slash
                if find_absolute_links(link):
                    if not should_exclude(link[1::]):
                        relative_links.add(link[1::])
            else:
                url = f'{direct_cur_link}/{link}'
                if not should_exclude(url):
                    relative_links.add(url)

    return relative_links

def cleanup_links(links: set[str]) -> set[str]:
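    """Normalize links (unquote, unescape, strip scheme and 'www.') and drop
    anchor and mailto-style links."""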
    cleaned_links = set()
    for tmp_link in links:
        # normalize link
        link = unquote(tmp_link)
        link = unescape(link)
        link = link.replace('www.', '')
        link = link.replace('http://', '').replace('https://', '')

        # skip anchor links
        if '#' in link:
            continue

        # skip mailto-style links (an email address before the first dot)
        link_parts = link.split('.')
        if '@' in link_parts[0]:
            continue

        cleaned_links.add(link)

    return cleaned_links

async def crawl(url: str, session: aiohttp.ClientSession):
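    """Fetch url, remember it in LINKS_TO_TRACK when its content type is trackable,
    and recursively crawl all links found in html pages."""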
    # todo
    if url.endswith('.'):
        return

    without_trailing_slash = url[:-1:] if url.endswith('/') else url
    if without_trailing_slash in VISITED_LINKS:
        return
    VISITED_LINKS.add(without_trailing_slash)

    try:
        logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
            status_code = response.status
            # default to '' so the membership checks below never see None
            content_type = response.headers.get('content-type', '')

            # if it was a redirect to the link with a trailing slash - handle that url
            if 300 < status_code < 400:
                location = response.headers.get('location', '')
                # todo rewrite logic
                if without_trailing_slash in location:
                    if not should_exclude(location):
                        logger.info(f'Trailing slash. {location}')
                        cleaned_link = list(cleanup_links({location}))[0]
                        await asyncio.gather(crawl(cleaned_link, session))

            if status_code != 200:
                return

            if 'text/html' in content_type:
                LINKS_TO_TRACK.add(url)

                html = await response.text()
                absolute_links = cleanup_links(find_absolute_links(html))
                relative_links = cleanup_links(find_relative_links(html, url))

                sub_links = absolute_links | relative_links
                await asyncio.gather(*[crawl(url, session) for url in sub_links])
            elif 'application/javascript' in content_type:
                LINKS_TO_TRACK.add(url)
            elif 'text/css' in content_type:
                LINKS_TO_TRACK.add(url)
            elif 'application/json' in content_type:
                LINKS_TO_TRACK.add(url)
            else:
                # TODO track hashes of image/svg/video content types
                logger.info(f'Unhandled type: {content_type}')
    except UnicodeDecodeError:
        logger.warning('Codec can\'t decode byte, so it was probably a .tgs file')
    except (TimeoutError, ClientConnectorError):
        # the url was already marked as visited above; unmark it so the retry is not skipped
        VISITED_LINKS.discard(without_trailing_slash)
        await asyncio.gather(crawl(url, session))

async def start(url_list: set[str]):
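    """Crawl every url from url_list concurrently using one shared client session."""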
    # the connector disables certificate checks; TIMEOUT caps each request at 30 seconds
    async with aiohttp.ClientSession(connector=CONNECTOR, timeout=TIMEOUT) as session:
        await asyncio.gather(*[crawl(url, session) for url in url_list])

if __name__ == '__main__':
    HIDDEN_URLS.add(BASE_URL)

    logger.info('Start crawling links...')
    start_time = time()
    asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))
    logger.info(f'Stop crawling links. {time() - start_time} sec.')

    with open(OUTPUT_FILENAME, 'w') as f:
        f.write('\n'.join(sorted(LINKS_TO_TRACK)))