control concurrent requests; a little optimization; upgrade python and deps

This commit is contained in:
Ilya (Marshal) 2024-06-02 16:05:39 +02:00
parent 463cb40112
commit 1f8a2678ba
No known key found for this signature in database
GPG key ID: F8FB1A006AD3156D
4 changed files with 50 additions and 31 deletions

View file

@ -18,14 +18,14 @@ jobs:
steps: steps:
- name: Clone. - name: Clone.
uses: actions/checkout@v2 uses: actions/checkout@v4
with: with:
token: ${{ secrets.PAT }} token: ${{ secrets.PAT }}
- name: Setup Python. - name: Setup Python.
uses: actions/setup-python@v2 uses: actions/setup-python@v5
with: with:
python-version: 3.9 python-version: 3.12
- name: Install dependencies. - name: Install dependencies.
run: | run: |

View file

@ -2,7 +2,9 @@ import asyncio
import logging import logging
import os import os
import re import re
from asyncio import Queue
from asyncio.exceptions import TimeoutError from asyncio.exceptions import TimeoutError
from functools import cache
from html import unescape from html import unescape
from time import time from time import time
from typing import Set from typing import Set
@ -11,6 +13,7 @@ from urllib.parse import unquote
import aiohttp import aiohttp
from aiohttp import ClientConnectorError, ServerDisconnectedError from aiohttp import ClientConnectorError, ServerDisconnectedError
PROTOCOL = 'https://' PROTOCOL = 'https://'
BASE_URL = 'telegram.org' BASE_URL = 'telegram.org'
# it's necessary to help crawler to find more links # it's necessary to help crawler to find more links
@ -21,11 +24,12 @@ HIDDEN_URLS = {
'corefork.telegram.org/getProxyConfig', 'corefork.telegram.org/getProxyConfig',
'telegram.org/privacy/gmailbot', 'telegram.org/privacy/gmailbot',
'telegram.org/tos',
'telegram.org/tos/mini-apps', 'telegram.org/tos/mini-apps',
'telegram.org/tos/p2pl', 'telegram.org/tos/p2pl',
'telegram.org/tour', 'telegram.org/tour',
'telegram.org/evolution', 'telegram.org/evolution',
'telegram.org/tos/bots',
'telegram.org/tos/business',
'desktop.telegram.org/changelog', 'desktop.telegram.org/changelog',
'td.telegram.org/current', 'td.telegram.org/current',
@ -133,6 +137,8 @@ CRAWL_RULES = {
r'apps$', r'apps$',
r'img/emoji/.+', r'img/emoji/.+',
r'img/StickerExample.psd$', r'img/StickerExample.psd$',
r'/privacy$', # geolocation depended
r'/tos$', # geolocation depended
}, },
}, },
'webz.telegram.org': { 'webz.telegram.org': {
@ -180,7 +186,7 @@ HEADERS = {
'TE': 'trailers', 'TE': 'trailers',
} }
logging.basicConfig(format='%(message)s', level=logging.DEBUG) logging.basicConfig(format='%(asctime)s %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
VISITED_LINKS = set() VISITED_LINKS = set()
@ -188,7 +194,11 @@ LINKS_TO_TRACK = set()
LINKS_TO_TRANSLATIONS = set() LINKS_TO_TRANSLATIONS = set()
LINKS_TO_TRACKABLE_RESOURCES = set() LINKS_TO_TRACKABLE_RESOURCES = set()
WORKERS_COUNT = 30
WORKERS_TASK_QUEUE = Queue()
@cache
def should_exclude(url: str) -> bool: def should_exclude(url: str) -> bool:
direct_link = re.findall(DIRECT_LINK_REGEX, url)[0] direct_link = re.findall(DIRECT_LINK_REGEX, url)[0]
domain_rules = CRAWL_RULES.get(direct_link) domain_rules = CRAWL_RULES.get(direct_link)
@ -210,6 +220,9 @@ def should_exclude(url: str) -> bool:
exclude = False exclude = False
break break
if exclude:
logger.debug('Exclude %s by rules', url)
return exclude return exclude
@ -254,7 +267,7 @@ def find_relative_scripts(code: str, cur_link: str) -> Set[str]:
# dirty magic for specific cases # dirty magic for specific cases
if '/' in link: # path to file from the root if '/' in link: # path to file from the root
url = f'{direct_cur_link}/{link}' url = f'{direct_cur_link}/{link}'
else: # its relative link from current folder. Not from the root else: # it is a relative link from the current folder. not from the root
current_folder_link, *_ = cur_link.rsplit('/', 1) current_folder_link, *_ = cur_link.rsplit('/', 1)
url = f'{current_folder_link}/{link}' url = f'{current_folder_link}/{link}'
@ -341,17 +354,18 @@ class ServerSideError(Exception):
pass pass
async def crawl(url: str, session: aiohttp.ClientSession): async def crawl_worker(session: aiohttp.ClientSession):
while True: while not WORKERS_TASK_QUEUE.empty():
url = WORKERS_TASK_QUEUE.get_nowait()
try: try:
await _crawl(url, session) await _crawl(url, session)
except (ServerSideError, ServerDisconnectedError, TimeoutError, ClientConnectorError): except (ServerSideError, ServerDisconnectedError, TimeoutError, ClientConnectorError):
logger.warning(f'Client or timeout error. Retrying {url}') logger.warning(f'Client or timeout error. Retrying {url}')
WORKERS_TASK_QUEUE.put_nowait(url)
if url in VISITED_LINKS: if url in VISITED_LINKS:
VISITED_LINKS.remove(url) VISITED_LINKS.remove(url)
else:
break
async def _crawl(url: str, session: aiohttp.ClientSession): async def _crawl(url: str, session: aiohttp.ClientSession):
@ -360,7 +374,7 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
VISITED_LINKS.add(url) VISITED_LINKS.add(url)
try: try:
logger.info(f'[{len(VISITED_LINKS)}] Process {url}') logger.debug('[%s] Process %s', len(VISITED_LINKS), url)
async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response: async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
content_type = response.headers.get('content-type') content_type = response.headers.get('content-type')
@ -372,20 +386,20 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
if response.status not in {200, 304}: if response.status not in {200, 304}:
if response.status != 302: if response.status != 302:
content = await response.text(encoding='UTF-8') content = await response.text(encoding='UTF-8')
logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}') logger.warning(f'Skip {url} because status code == {response.status}. Content: {content}')
return return
if is_textable_content_type(content_type): if is_textable_content_type(content_type):
# raw content will be cached by aiohttp. Don't worry about it # aiohttp will cache raw content. we don't worry about it
raw_content = await response.read() raw_content = await response.read()
content = await response.text(encoding='UTF-8') content = await response.text(encoding='UTF-8')
if is_translation_url(url): if is_translation_url(url):
LINKS_TO_TRANSLATIONS.add(url) LINKS_TO_TRANSLATIONS.add(url)
logger.info(f'add {url} to LINKS_TO_TRANSLATIONS') logger.debug('Add %s to LINKS_TO_TRANSLATIONS', url)
else: else:
LINKS_TO_TRACK.add(url) LINKS_TO_TRACK.add(url)
logger.info(f'add {url} to LINKS_TO_TRACK') logger.debug('Add %s to LINKS_TO_TRACK', url)
absolute_links = cleanup_links(find_absolute_links(content)) absolute_links = cleanup_links(find_absolute_links(content))
@ -396,33 +410,40 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
relative_links = cleanup_links(relative_links_finder(content, url)) relative_links = cleanup_links(relative_links_finder(content, url))
sub_links = absolute_links | relative_links sub_links = absolute_links | relative_links
await asyncio.gather(*[crawl(url, session) for url in sub_links]) for sub_url in sub_links:
if sub_url not in VISITED_LINKS:
WORKERS_TASK_QUEUE.put_nowait(sub_url)
elif is_trackable_content_type(content_type): elif is_trackable_content_type(content_type):
LINKS_TO_TRACKABLE_RESOURCES.add(url) LINKS_TO_TRACKABLE_RESOURCES.add(url)
logger.info(f'add {url} to LINKS_TO_TRACKABLE_RESOURCES') logger.debug('Add %s to LINKS_TO_TRACKABLE_RESOURCES', url)
else: else:
# for example, zip with update of macOS client # for example, zip with update of macOS client
logger.info(f'Unhandled type: {content_type} from {url}') logger.warning(f'Unhandled type: {content_type} from {url}')
# telegram url can work with and without trailing slash (no redirect). P.S. not on every subdomain ;d # telegram url can work with and without a trailing slash (no redirect).
# so this is a problem when we have random behavior with link will be added # note: not on every subdomain ;d
# this if resolve this issue. If available both link we prefer without trailing slash # so this is a problem when we have random behavior with a link will be added
# this if resolve this issue.
# if available both links, we prefer without a trailing slash
for links_set in (LINKS_TO_TRACK, LINKS_TO_TRANSLATIONS, LINKS_TO_TRACKABLE_RESOURCES): for links_set in (LINKS_TO_TRACK, LINKS_TO_TRANSLATIONS, LINKS_TO_TRACKABLE_RESOURCES):
without_trailing_slash = url[:-1:] if url.endswith('/') else url without_trailing_slash = url[:-1:] if url.endswith('/') else url
if without_trailing_slash in links_set and f'{without_trailing_slash}/' in links_set: if without_trailing_slash in links_set and f'{without_trailing_slash}/' in links_set:
links_set.remove(f'{without_trailing_slash}/') links_set.remove(f'{without_trailing_slash}/')
logger.info(f'remove {without_trailing_slash}/') logger.debug('Remove %s/', without_trailing_slash)
except UnicodeDecodeError: except UnicodeDecodeError:
logger.warning(f'Codec can\'t decode bytes. So it was a tgs file or response with broken content type {url}') logger.warning(f'Codec can\'t decode bytes. So it was a tgs file or response with broken content type {url}')
if raw_content.startswith(b'GIF'): if raw_content.startswith(b'GIF'):
LINKS_TO_TRACKABLE_RESOURCES.add(url) LINKS_TO_TRACKABLE_RESOURCES.add(url)
logger.info(f'add {url} to LINKS_TO_TRACKABLE_RESOURCES (raw content)') logger.debug('Add %s to LINKS_TO_TRACKABLE_RESOURCES (raw content)', url)
async def start(url_list: Set[str]): async def start(url_list: Set[str]):
for url in url_list:
WORKERS_TASK_QUEUE.put_nowait(url)
async with aiohttp.ClientSession(connector=CONNECTOR, headers=HEADERS) as session: async with aiohttp.ClientSession(connector=CONNECTOR, headers=HEADERS) as session:
await asyncio.gather(*[crawl(url, session) for url in url_list]) await asyncio.gather(*[crawl_worker(session) for _ in range(WORKERS_COUNT)])
if __name__ == '__main__': if __name__ == '__main__':
@ -443,8 +464,8 @@ if __name__ == '__main__':
CURRENT_URL_LIST = LINKS_TO_TRACK | LINKS_TO_TRACKABLE_RESOURCES | LINKS_TO_TRANSLATIONS CURRENT_URL_LIST = LINKS_TO_TRACK | LINKS_TO_TRACKABLE_RESOURCES | LINKS_TO_TRANSLATIONS
logger.info(f'Is equal: {OLD_URL_LIST == CURRENT_URL_LIST}') logger.info(f'Is equal: {OLD_URL_LIST == CURRENT_URL_LIST}')
logger.info(f'Deleted: {OLD_URL_LIST - CURRENT_URL_LIST}') logger.info(f'Deleted ({len(OLD_URL_LIST - CURRENT_URL_LIST)}): {OLD_URL_LIST - CURRENT_URL_LIST}')
logger.info(f'Added: {CURRENT_URL_LIST - OLD_URL_LIST}') logger.info(f'Added ({len(CURRENT_URL_LIST - OLD_URL_LIST)}): {CURRENT_URL_LIST - OLD_URL_LIST}')
except IOError: except IOError:
pass pass

View file

@ -1,9 +1,9 @@
aiohttp==3.7.4.post0 aiohttp==3.9.5
aiodns==3.0.0 aiodns==3.2.0
aiofiles==0.6.0 aiofiles==0.6.0
git+https://github.com/MarshalX/pyrogram git+https://github.com/MarshalX/pyrogram
TgCrypto==1.2.3 TgCrypto==1.2.3
beautifulsoup4==4.11.1 beautifulsoup4==4.11.1
cssutils==2.4.2 cssutils==2.4.2
requests==2.31.0 requests==2.31.0
# uvloop==0.16.0 # uvloop==0.19.0

View file

@ -7558,11 +7558,9 @@ telegram.org/js/tgsticker-worker.js
telegram.org/js/tgsticker.js telegram.org/js/tgsticker.js
telegram.org/js/widget-frame.js telegram.org/js/widget-frame.js
telegram.org/press telegram.org/press
telegram.org/privacy
telegram.org/privacy/gmailbot telegram.org/privacy/gmailbot
telegram.org/support telegram.org/support
telegram.org/t.me/PremiumBot telegram.org/t.me/PremiumBot
telegram.org/tos
telegram.org/tos/bot-developers telegram.org/tos/bot-developers
telegram.org/tos/bots telegram.org/tos/bots
telegram.org/tos/business telegram.org/tos/business