2021-04-23 20:34:42 +02:00
|
|
|
import asyncio
|
|
|
|
import logging
|
2021-04-23 22:49:21 +02:00
|
|
|
import os
|
2021-04-23 20:34:42 +02:00
|
|
|
import re
|
2024-06-02 16:05:39 +02:00
|
|
|
from asyncio import Queue
|
2021-04-24 11:38:33 +02:00
|
|
|
from asyncio.exceptions import TimeoutError
|
2024-06-02 16:05:39 +02:00
|
|
|
from functools import cache
|
2021-04-23 20:34:42 +02:00
|
|
|
from html import unescape
|
2021-04-23 22:49:21 +02:00
|
|
|
from time import time
|
2023-10-21 20:04:37 +02:00
|
|
|
from typing import Set
|
2021-04-23 20:34:42 +02:00
|
|
|
from urllib.parse import unquote
|
|
|
|
|
|
|
|
import aiohttp
|
2021-06-21 16:09:49 +02:00
|
|
|
from aiohttp import ClientConnectorError, ServerDisconnectedError
|
2021-04-23 20:34:42 +02:00
|
|
|
|
2024-06-02 16:05:39 +02:00
|
|
|
|
2021-04-23 20:34:42 +02:00
|
|
|
# Scheme prepended to every link before it is requested.
PROTOCOL = 'https://'
# Root domain of the crawl; the entry point adds it to the seed links.
BASE_URL = 'telegram.org'

# Seed links the crawler starts from. They are listed explicitly because it is
# necessary to help the crawler find more links.
HIDDEN_URLS: Set[str] = {
    # forks of the main sites
    'blogfork.telegram.org',
    'corefork.telegram.org',
    'corefork.telegram.org/getProxyConfig',

    # telegram.org pages
    'telegram.org/privacy/gmailbot',
    'telegram.org/tos/mini-apps',
    'telegram.org/tos/p2pl',
    'telegram.org/tos/bots',
    'telegram.org/tos/business',
    'telegram.org/tour',
    'telegram.org/evolution',

    # client updates
    'desktop.telegram.org/changelog',
    'td.telegram.org/current',
    'td.telegram.org/current2',
    'td.telegram.org/current4',
    'td.telegram.org/current5',  # tdx
    'osx.telegram.org/updates/versions.xml',  # stable
    'api.appcenter.ms/v0.1/public/sparkle/apps/6ed2ac30-49e1-4073-87c2-f1ffcb74e81f',  # beta

    # instant view and core docs
    'instantview.telegram.org/rules',
    'core.telegram.org/resources/cidr.txt',
    'core.telegram.org/apple_privacy',
    'core.telegram.org/getProxyConfig',
    'core.telegram.org/video_stickers',
    'core.telegram.org/stickers',

    'promote.telegram.org',
    'contest.com',

    # web apps beta
    'comments.app/test_webview',  # old
    'webappcontent.telegram.org/demo',  # new
    'webappcontent.telegram.org/cafe',  # demo 2
    # 'a-webappcontent.stel.com/demo',
    # 'a-webappcontent.stel.com/cafe',

    # fragment
    # 'fragment.com/about',
    # 'fragment.com/privacy',
    # 'fragment.com/terms',
    'fragment.com/css/auction.css',
    'fragment.com/js/auction.js',
}
|
2021-06-01 08:57:10 +02:00
|
|
|
# Schema files hosted on GitHub. The script entry point merges them straight
# into the set of tracked links.
ADDITIONAL_URLS = {
    'raw.githubusercontent.com/' + repo_path
    for repo_path in (
        'telegramdesktop/tdesktop/dev/Telegram/SourceFiles/mtproto/scheme/mtproto.tl',
        'telegramdesktop/tdesktop/dev/Telegram/SourceFiles/mtproto/scheme/api.tl',
        'tdlib/td/master/td/generate/scheme/telegram_api.tl',
        'tdlib/td/master/td/generate/scheme/secret_api.tl',
        'tdlib/td/master/td/generate/scheme/td_api.tl',
    )
}
|
2021-04-23 20:34:42 +02:00
|
|
|
# Regex fragment for the base domain. The dot is escaped so that it matches a
# literal '.' only; unescaped it would match any character ('telegramXorg').
BASE_URL_REGEX = r'telegram\.org'
|
|
|
|
|
2021-04-24 11:29:19 +02:00
|
|
|
# Disable crawling of sub links for specific domains and url patterns.
#   - every rule is a regex
#   - an empty string means "match any url"
#   - allow rules have higher priority than deny rules
_DENY_ANY_URL = ''

# The core documentation site and its forks share the same deny list.
_DOCS_DENY_RULES = (
    'bots/payments',
    'tdlib/docs/classtd',
    'validatedRequestedInfo',
    'constructor/Updates',
)

CRAWL_RULES = {
    'translations.telegram.org': {
        'allow': {
            r'^[^/]*$',  # root
            r'org/[^/]*/$',  # 1 lvl sub
            r'/css/[a-z-_.]+$',  # css files
            r'/en/[a-z_]+/$',  # 1 lvl after /en/
            r'/en/[a-z_]+/[a-z_]+/$',  # 2 lvl after /en/. for example, /en/ios/unsorted/
        },
        'deny': {_DENY_ANY_URL},  # all
    },
    'osx.telegram.org': {
        'deny': {'updates/Telegram'},
    },
    'bugs.telegram.org': {  # crawl first page of cards sorted by rating
        # r'/c/[0-9]+/[0-9]+',  # disable comments
        'deny': {_DENY_ANY_URL},
    },
    'instantview.telegram.org': {
        'deny': {
            r'templates/.+',
            'samples/',
            'contest',
        },
    },
    'core.telegram.org': {'deny': set(_DOCS_DENY_RULES)},
    'corefork.telegram.org': {'deny': set(_DOCS_DENY_RULES)},
    'blogfork.telegram.org': {'deny': set(_DOCS_DENY_RULES)},
    'telegram.org': {
        'deny': {
            r'apps$',
            r'img/emoji/.+',
            r'img/StickerExample.psd$',
            r'/privacy$',  # geolocation depended
            r'/tos$',  # geolocation depended
        },
    },
    'webz.telegram.org': {'deny': {_DENY_ANY_URL}},
    'webk.telegram.org': {'deny': {_DENY_ANY_URL}},
}
|
|
|
|
|
|
|
|
# Characters accepted in front of the base domain (sub domains and the like).
_LINK_PREFIX_CHARS = r'[-a-zA-Z0-9@:%._\+~#]'

# Host part only: optional prefix followed by the base domain.
DIRECT_LINK_REGEX = f'({_LINK_PREFIX_CHARS}{{0,249}}{BASE_URL_REGEX})'
# Host part plus whatever path/query characters follow it.
ABSOLUTE_LINK_REGEX = f'({_LINK_PREFIX_CHARS}{{0,248}}{BASE_URL_REGEX}' + r'\b[-a-zA-Z0-9@:%_\+.~#?&//=]*)'
# Path that starts with a single slash; the capture excludes that leading slash.
RELATIVE_LINK_REGEX = r'\/(?!\/)([-a-zA-Z0-9\/@:%._\+~#]{0,249})'
# Quoted string ending in ".js" inside script code.
RELATIVE_JS_SCRIPTS_REGEX = r'["\'](.*\.js)["\'\?]'
|
2021-04-23 20:34:42 +02:00
|
|
|
|
|
|
|
# HTML attributes whose values are scanned for relative links.
DOM_ATTRS = ['href', 'src']

# Output files; each name can be overridden through an environment variable
# of the same name.
OUTPUT_FILENAME = os.getenv('OUTPUT_FILENAME', 'tracked_links.txt')
OUTPUT_RESOURCES_FILENAME = os.getenv('OUTPUT_RESOURCES_FILENAME', 'tracked_res_links.txt')
OUTPUT_TRANSLATIONS_FILENAME = os.getenv('OUTPUT_TRANSLATIONS_FILENAME', 'tracked_tr_links.txt')
|
2021-04-23 20:34:42 +02:00
|
|
|
|
2022-05-13 11:57:11 +02:00
|
|
|
# Value sent in the stel_dev_layer cookie with every request.
STEL_DEV_LAYER = 190

# unsecure but so simple (ssl=False)
CONNECTOR = aiohttp.TCPConnector(limit=300, force_close=True, ssl=False)
# Upper bound, in seconds, for a whole request.
TIMEOUT = aiohttp.ClientTimeout(total=10)

# Browser-like (Firefox) request headers used for the whole session.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:99.0) Gecko/20100101 Firefox/99.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Cookie': '; '.join(('stel_ln=en', f'stel_dev_layer={STEL_DEV_LAYER}')),
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Cache-Control': 'max-age=0',
    'TE': 'trailers',
}
|
2021-04-23 20:34:42 +02:00
|
|
|
|
2024-06-02 16:05:39 +02:00
|
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Crawl state shared by all workers.
VISITED_LINKS: Set[str] = set()  # links already taken for processing
LINKS_TO_TRACK: Set[str] = set()  # textual pages
LINKS_TO_TRANSLATIONS: Set[str] = set()  # textual pages of the translations site
LINKS_TO_TRACKABLE_RESOURCES: Set[str] = set()  # images, videos, archives, ...

# Number of concurrent workers and the queue of links they consume.
WORKERS_COUNT = 30
WORKERS_TASK_QUEUE = Queue()
|
|
|
|
|
2021-04-23 20:34:42 +02:00
|
|
|
|
2024-06-02 16:05:39 +02:00
|
|
|
@cache
def should_exclude(url: str) -> bool:
    """Decide whether *url* must be skipped according to CRAWL_RULES.

    The host part of the url (first DIRECT_LINK_REGEX match) selects the rule
    set. A url is excluded when at least one 'deny' regex matches it and no
    'allow' regex does; allow rules have priority over deny rules.

    Results are memoized per url by ``functools.cache``.

    :param url: link without protocol, e.g. ``core.telegram.org/bots``.
    :return: True when the url must not be crawled, False otherwise
        (including when the url has no recognizable host or no rules).
    """
    hosts = re.findall(DIRECT_LINK_REGEX, url)
    if not hosts:
        # The url does not contain the base domain, so no rule set can apply.
        # Indexing the empty result here used to raise IndexError.
        return False

    domain_rules = CRAWL_RULES.get(hosts[0])
    if not domain_rules:
        return False

    denied = any(re.search(rule, url) for rule in domain_rules.get('deny', ()))
    allowed = any(re.search(rule, url) for rule in domain_rules.get('allow', ()))

    exclude = denied and not allowed
    if exclude:
        logger.debug('Exclude %s by rules', url)

    return exclude
|
2021-04-23 20:34:42 +02:00
|
|
|
|
|
|
|
|
2023-10-21 20:04:37 +02:00
|
|
|
def find_absolute_links(html: str) -> Set[str]:
    """Collect every absolute link found in *html* that the crawl rules do not exclude.

    :param html: text to scan with ABSOLUTE_LINK_REGEX.
    :return: set of unique, non-excluded links.
    """
    kept = set()
    for candidate in set(re.findall(ABSOLUTE_LINK_REGEX, html)):
        if should_exclude(candidate):
            continue
        kept.add(candidate)
    return kept
|
2021-04-23 20:34:42 +02:00
|
|
|
|
|
|
|
|
2023-10-21 20:04:37 +02:00
|
|
|
def find_relative_links(html: str, cur_link: str) -> Set[str]:
    """Collect relative links from the DOM attributes of *html*.

    Every relative path found in an attribute listed in DOM_ATTRS is resolved
    against the host of *cur_link* and kept unless the crawl rules exclude it.

    :param html: page source to scan.
    :param cur_link: link of the page the source was downloaded from.
    :return: set of resolved links; empty when *cur_link* has no recognizable
        host or is itself excluded.
    """
    hosts = re.findall(DIRECT_LINK_REGEX, cur_link)
    # Nothing to resolve against. The second check is an optimization for
    # pages whose own link is excluded.
    if not hosts or should_exclude(cur_link):
        return set()

    host = hosts[0]
    found = set()
    for attr in DOM_ATTRS:
        pattern = f'{attr}="{RELATIVE_LINK_REGEX}'
        for path in re.findall(pattern, html):
            candidate = f'{host}/{path}'
            if not should_exclude(candidate):
                found.add(candidate)

    return found
|
|
|
|
|
|
|
|
|
2023-10-21 20:04:37 +02:00
|
|
|
def find_relative_scripts(code: str, cur_link: str) -> Set[str]:
    """Collect links to ``.js`` files referenced from script *code*.

    :param code: script source to scan with RELATIVE_JS_SCRIPTS_REGEX.
    :param cur_link: link the script was downloaded from.
    :return: set of resolved, non-excluded links; empty when *cur_link* has no
        recognizable host.
    """
    hosts = re.findall(DIRECT_LINK_REGEX, cur_link)
    if not hosts:
        return set()

    host = hosts[0]
    # Folder of the current script: everything before the last slash.
    current_folder = cur_link.rsplit('/', 1)[0]

    found = set()
    for script in re.findall(RELATIVE_JS_SCRIPTS_REGEX, code):
        # dirty magic for specific cases:
        # a name with a slash is a path from the root, a bare file name is
        # relative to the current folder
        base = host if '/' in script else current_folder
        candidate = f'{base}/{script}'
        if not should_exclude(candidate):
            found.add(candidate)

    return found
|
|
|
|
|
|
|
|
|
2023-10-21 20:04:37 +02:00
|
|
|
def cleanup_links(links: Set[str]) -> Set[str]:
    """Normalize raw links and drop the ones that must not be crawled.

    For every link: percent-decode it, unescape HTML entities, strip ``www.``
    and the protocol, collapse ``//`` into ``/`` and remove double quotes.
    Links with an anchor (``#``) and mail addresses are skipped, GET params
    are cut off, and a leading dot (wildcard domain) is removed.

    :param links: raw links as found in a page.
    :return: set of normalized links.
    """
    cleaned_links = set()
    for raw_link in links:
        # normalize link
        link = unescape(unquote(raw_link))
        link = link.replace('www.', '')
        link = link.replace('http://', '').replace('https://', '')
        link = link.replace('//', '/')  # not a universal solution
        link = link.replace('"', '')  # regex fix hack

        # skip anchor links
        if '#' in link:
            continue

        # remove get params from link: keep only what precedes the first '?'.
        # (Joining all but the last '?'-separated part glued path and query
        # together for links with more than one '?'.)
        link = link.split('?', 1)[0]

        # skip mailto:
        if '@' in link.split('.', 1)[0]:
            continue

        # fix wildcard
        if link.startswith('.'):
            link = link[1:]

        cleaned_links.add(link)

    return cleaned_links
|
|
|
|
|
|
|
|
|
2023-10-21 20:04:37 +02:00
|
|
|
def _is_x_content_type(content_types_set: Set[str], content_type) -> bool:
|
2022-05-13 11:57:11 +02:00
|
|
|
for match_content_type in content_types_set:
|
|
|
|
if match_content_type in content_type:
|
|
|
|
return True
|
|
|
|
|
|
|
|
return False
|
|
|
|
|
|
|
|
|
2022-06-18 17:36:39 +02:00
|
|
|
def is_translation_url(url: str) -> bool:
    """Tell whether *url* contains the host of the translations site."""
    translations_host = 'translations.telegram.org'
    return translations_host in url
|
|
|
|
|
|
|
|
|
2022-05-13 11:57:11 +02:00
|
|
|
def is_textable_content_type(content_type: str) -> bool:
    """Tell whether *content_type* denotes content that is processed as text.

    :param content_type: value of the Content-Type header.
    :return: True when the value contains one of the textual markers.
    """
    return _is_x_content_type(
        {'plain', 'css', 'json', 'text', 'javascript'},
        content_type,
    )
|
|
|
|
|
|
|
|
|
|
|
|
def is_trackable_content_type(content_type) -> bool:
    """Tell whether *content_type* denotes a resource that is tracked by link only.

    :param content_type: value of the Content-Type header.
    :return: True when the value contains one of the resource markers.
    """
    image_markers = {'svg', 'png', 'jpeg', 'x-icon', 'gif'}
    video_markers = {'mp4', 'webm'}
    binary_markers = {
        'application/octet-stream',  # td updates
        'application/zip',
    }
    return _is_x_content_type(image_markers | video_markers | binary_markers, content_type)
|
2022-04-10 17:46:36 +02:00
|
|
|
|
|
|
|
|
2022-05-11 11:10:08 +02:00
|
|
|
class ServerSideError(Exception):
    """Raised when the server answers with a 5XX status, so the link is retried."""
|
|
|
|
|
|
|
|
|
2024-06-02 16:05:39 +02:00
|
|
|
async def crawl_worker(session: aiohttp.ClientSession):
    """Consume links from WORKERS_TASK_QUEUE until the queue is found empty.

    A link whose processing fails with a server-side, connection or timeout
    error is put back on the queue and unmarked as visited so that it gets
    processed again.

    :param session: shared aiohttp session used for all requests.
    """
    retryable_errors = (ServerSideError, ServerDisconnectedError, TimeoutError, ClientConnectorError)

    # NOTE(review): the worker returns as soon as it sees an empty queue, even
    # if other workers are still mid-request and about to enqueue more links.
    # NOTE(review): there is no cap on retries of a persistently failing link.
    while True:
        if WORKERS_TASK_QUEUE.empty():
            break

        url = WORKERS_TASK_QUEUE.get_nowait()
        try:
            await _crawl(url, session)
        except retryable_errors:
            logger.warning(f'Client or timeout error. Retrying {url}')
            WORKERS_TASK_QUEUE.put_nowait(url)
            # let _crawl accept this link again
            VISITED_LINKS.discard(url)
|
2022-05-11 11:10:08 +02:00
|
|
|
|
|
|
|
|
|
|
|
async def _crawl(url: str, session: aiohttp.ClientSession):
    """Download *url*, classify it and enqueue the links found in its body.

    Textual responses are added to LINKS_TO_TRANSLATIONS or LINKS_TO_TRACK and
    scanned for absolute and relative links, which are put on
    WORKERS_TASK_QUEUE. Trackable resources are added to
    LINKS_TO_TRACKABLE_RESOURCES. Responses with a status other than 200/304
    are skipped.

    :param url: link without protocol.
    :param session: shared aiohttp session.
    :raises ServerSideError: when the server answers with a 5XX status; the
        link is unmarked as visited first so the caller can retry it.
    """
    if url in VISITED_LINKS:
        return
    VISITED_LINKS.add(url)

    # Bound up front so the UnicodeDecodeError handler below can always inspect
    # it, even when decoding fails before the body bytes were read (e.g. while
    # logging the body of a non-200 response).
    raw_content = b''

    try:
        logger.debug('[%s] Process %s', len(VISITED_LINKS), url)
        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
            # A response without a Content-Type header is handled as an
            # unknown type instead of crashing on the substring checks below.
            content_type = response.headers.get('content-type') or ''

            if 499 < response.status < 600:
                VISITED_LINKS.remove(url)
                logger.warning(f'Error 5XX. Retrying {url}')
                raise ServerSideError()

            if response.status not in {200, 304}:
                if response.status != 302:
                    content = await response.text(encoding='UTF-8')
                    logger.warning(f'Skip {url} because status code == {response.status}. Content: {content}')
                return

            if is_textable_content_type(content_type):
                # aiohttp will cache raw content. we don't worry about it
                raw_content = await response.read()
                content = await response.text(encoding='UTF-8')

                if is_translation_url(url):
                    LINKS_TO_TRANSLATIONS.add(url)
                    logger.debug('Add %s to LINKS_TO_TRANSLATIONS', url)
                else:
                    LINKS_TO_TRACK.add(url)
                    logger.debug('Add %s to LINKS_TO_TRACK', url)

                absolute_links = cleanup_links(find_absolute_links(content))

                relative_links_finder = find_relative_links
                if 'javascript' in content_type:
                    relative_links_finder = find_relative_scripts

                relative_links = cleanup_links(relative_links_finder(content, url))

                sub_links = absolute_links | relative_links
                for sub_url in sub_links:
                    if sub_url not in VISITED_LINKS:
                        WORKERS_TASK_QUEUE.put_nowait(sub_url)
            elif is_trackable_content_type(content_type):
                LINKS_TO_TRACKABLE_RESOURCES.add(url)
                logger.debug('Add %s to LINKS_TO_TRACKABLE_RESOURCES', url)
            else:
                # for example, zip with update of macOS client
                logger.warning(f'Unhandled type: {content_type} from {url}')

            # telegram url can work with and without a trailing slash (no redirect).
            # note: not on every subdomain ;d
            # so which of the two variants gets recorded would be random.
            # this loop resolves the issue:
            # if both links are present, we prefer the one without a trailing slash
            without_trailing_slash = url[:-1] if url.endswith('/') else url
            with_trailing_slash = f'{without_trailing_slash}/'
            for links_set in (LINKS_TO_TRACK, LINKS_TO_TRANSLATIONS, LINKS_TO_TRACKABLE_RESOURCES):
                if without_trailing_slash in links_set and with_trailing_slash in links_set:
                    links_set.remove(with_trailing_slash)
                    logger.debug('Remove %s/', without_trailing_slash)
    except UnicodeDecodeError:
        logger.warning(f'Codec can\'t decode bytes. So it was a tgs file or response with broken content type {url}')

        # a GIF served with a textual content type is still a trackable resource
        if raw_content.startswith(b'GIF'):
            LINKS_TO_TRACKABLE_RESOURCES.add(url)
            logger.debug('Add %s to LINKS_TO_TRACKABLE_RESOURCES (raw content)', url)
|
2021-04-23 20:34:42 +02:00
|
|
|
|
|
|
|
|
2023-10-21 20:04:37 +02:00
|
|
|
async def start(url_list: Set[str]):
    """Seed the task queue with *url_list* and run the workers until they all finish.

    :param url_list: initial links to crawl.
    """
    for seed_url in url_list:
        WORKERS_TASK_QUEUE.put_nowait(seed_url)

    async with aiohttp.ClientSession(connector=CONNECTOR, headers=HEADERS) as session:
        workers = [crawl_worker(session) for _ in range(WORKERS_COUNT)]
        await asyncio.gather(*workers)
|
2021-04-23 22:31:23 +02:00
|
|
|
|
|
|
|
|
2021-04-23 20:34:42 +02:00
|
|
|
# Script entry point: crawl, report the difference to the previous run and
# rewrite the three output files.
if __name__ == '__main__':
    HIDDEN_URLS.add(BASE_URL)
    LINKS_TO_TRACK = LINKS_TO_TRACK | ADDITIONAL_URLS

    logger.info('Start crawling links...')
    start_time = time()
    # NOTE(review): CONNECTOR is created at import time, outside any running
    # loop; confirm it stays usable before replacing this with asyncio.run().
    asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))
    logger.info(f'Stop crawling links. {time() - start_time} sec.')

    # Best-effort comparison with the previous run. Links may contain
    # non-ASCII characters after percent-decoding, so the files are always
    # read and written as UTF-8 instead of the locale default encoding.
    try:
        OLD_URL_LIST = set()
        for filename in (OUTPUT_FILENAME, OUTPUT_RESOURCES_FILENAME, OUTPUT_TRANSLATIONS_FILENAME):
            with open(filename, 'r', encoding='utf-8') as f:
                OLD_URL_LIST |= {line.rstrip('\n') for line in f}

        CURRENT_URL_LIST = LINKS_TO_TRACK | LINKS_TO_TRACKABLE_RESOURCES | LINKS_TO_TRANSLATIONS

        deleted_links = OLD_URL_LIST - CURRENT_URL_LIST
        added_links = CURRENT_URL_LIST - OLD_URL_LIST

        logger.info(f'Is equal: {OLD_URL_LIST == CURRENT_URL_LIST}')
        logger.info(f'Deleted ({len(deleted_links)}): {deleted_links}')
        logger.info(f'Added ({len(added_links)}): {added_links}')
    except IOError as error:
        # e.g. first run: there is nothing to compare with yet
        logger.info('Skip comparison with the previous run: %s', error)

    with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as f:
        f.write('\n'.join(sorted(LINKS_TO_TRACK)))

    with open(OUTPUT_RESOURCES_FILENAME, 'w', encoding='utf-8') as f:
        f.write('\n'.join(sorted(LINKS_TO_TRACKABLE_RESOURCES)))

    with open(OUTPUT_TRANSLATIONS_FILENAME, 'w', encoding='utf-8') as f:
        f.write('\n'.join(sorted(LINKS_TO_TRANSLATIONS)))
|