Mirror of https://github.com/MarshalX/telegram-crawler.git, synced 2025-03-15 13:22:43 +01:00
stel dev layer, headers, loop instead of rec; workflow timeout; aiohttp limit increase.
parent cdc433274c
commit c6e13dc54d
4 changed files with 116 additions and 67 deletions
.github/workflows/make_files_tree.yml (vendored): 2 changes
@@ -12,7 +12,7 @@ jobs:
   fetch_new_content:
     name: Make files tree
     runs-on: macos-10.15
-    # timeout-minutes: 10
+    timeout-minutes: 10
 
     steps:
@@ -44,9 +44,27 @@ SPARKLE_SE_REGEX = r';se=(.*?);'
 SPARKLE_SIG_TEMPLATE = f';sig={DYNAMIC_PART_MOCK};'
 SPARKLE_SE_TEMPLATE = f';se={DYNAMIC_PART_MOCK};'
 
+stel_dev_layer = 132
+
 # unsecure but so simple
-CONNECTOR = aiohttp.TCPConnector(ssl=False)
+CONNECTOR = aiohttp.TCPConnector(ssl=False, force_close=True, limit=300)
 TIMEOUT = aiohttp.ClientTimeout(total=10)
+HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:99.0) Gecko/20100101 Firefox/99.0',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+    'Accept-Language': 'en-US,en;q=0.5',
+    'Accept-Encoding': 'gzip, deflate, br',
+    'DNT': '1',
+    'Connection': 'keep-alive',
+    'Cookie': f'stel_ln=en; stel_dev_layer={stel_dev_layer}',
+    'Upgrade-Insecure-Requests': '1',
+    'Sec-Fetch-Dest': 'document',
+    'Sec-Fetch-Mode': 'navigate',
+    'Sec-Fetch-Site': 'none',
+    'Sec-Fetch-User': '?1',
+    'Cache-Control': 'max-age=0',
+    'TE': 'trailers',
+}
 
 logging.basicConfig(format='%(message)s', level=logging.INFO)
 logger = logging.getLogger(__name__)
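Both crawler scripts gain the same networking setup here: a TCPConnector with forced connection close and a higher simultaneous-connection limit, a 10-second total timeout, and browser-like request headers that carry the stel_dev_layer cookie. Below is a minimal, self-contained sketch of how such a setup is typically wired into an aiohttp session; the target URL and the trimmed header set are illustrative only and not taken from the diff.

    import asyncio
    import aiohttp

    STEL_DEV_LAYER = 132  # mirrors the stel_dev_layer constant added by the commit

    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:99.0) Gecko/20100101 Firefox/99.0',
        'Cookie': f'stel_ln=en; stel_dev_layer={STEL_DEV_LAYER}',
    }

    async def main() -> None:
        # force_close=True disables keep-alive reuse of connections;
        # limit=300 raises the connector's cap on simultaneous connections (default 100).
        connector = aiohttp.TCPConnector(ssl=False, force_close=True, limit=300)
        timeout = aiohttp.ClientTimeout(total=10)
        async with aiohttp.ClientSession(connector=connector, headers=HEADERS, timeout=timeout) as session:
            async with session.get('https://core.telegram.org') as response:  # example URL
                print(response.status, response.content_type)

    if __name__ == '__main__':
        asyncio.run(main())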
@@ -56,7 +74,7 @@ def get_hash(data: bytes) -> str:
     return hashlib.sha256(data).hexdigest()
 
 
-async def download_file(url, path, session):
+async def download_file(url: str, path: str, session: aiohttp.ClientSession):
     async with session.get(url) as response:
         if response.status != 200:
             return
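The download_file hunk only shows the new type-annotated signature and the status check. A hedged sketch of what a complete body along those lines could look like follows; the write step is an assumption, since the diff truncates the function here.

    import aiofiles
    import aiohttp

    async def download_file(url: str, path: str, session: aiohttp.ClientSession) -> None:
        async with session.get(url) as response:
            if response.status != 200:
                return
            # Assumed continuation: write the response body to disk in binary mode.
            async with aiofiles.open(path, 'wb') as f:
                await f.write(await response.read())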
@@ -437,70 +455,72 @@ class RetryError(Exception):
 
 
-async def crawl(url: str, session: aiohttp.ClientSession):
-    try:
-        logger.info(f'Process {url}')
-        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
-            if response.status // 100 == 5:
-                msg = f'Error 5XX. Retrying {url}'
-                logger.warning(msg)
-                raise RetryError(msg)
-
-            if response.status not in {200, 304}:
-                if response.status != 302:
-                    content = await response.text()
-                    logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
-                return
-
-            # bypass external slashes and so on
-            url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
-
-            is_hashable_only = is_hashable_only_content_type(response.content_type)
-            # amazing dirt for media files like
-            # telegram.org/file/811140591/1/q7zZHjgES6s/9d121a89ffb0015837
-            # with response content type HTML instead of image. Shame on you
-            # sometimes it returns correct type. noice load balancing
-            is_sucking_file = '/file/' in url and 'text' in response.content_type
-
-            # handle pure domains and html pages without ext in url
-            ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
-
-            # I don't add ext by content type for images and so on cuz TG servers sucks.
-            # Some servers do not return correct content type. Some servers do...
-            if is_hashable_only or is_sucking_file:
-                ext = ''
-
-            filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
-            os.makedirs(os.path.dirname(filename), exist_ok=True)
-
-            if is_sucking_file or is_hashable_only:
-                content = await response.read()
-                async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
-                    await f.write(get_hash(content))
-                return
-
-            content = await response.text(encoding='UTF-8')
-            if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
-                content = await collect_translations_paginated_content(url, session)
-
-            content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
-            content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
-            content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
-            content = re.sub(NONCE_REGEX, NONCE_TEMPLATE, content)
-            content = re.sub(PROXY_CONFIG_SUB_NET_REGEX, PROXY_CONFIG_SUB_NET_TEMPLATE, content)
-            content = re.sub(TRANSLATE_SUGGESTION_REGEX, '', content)
-            content = re.sub(SPARKLE_SIG_REGEX, SPARKLE_SIG_TEMPLATE, content)
-            content = re.sub(SPARKLE_SE_REGEX, SPARKLE_SE_TEMPLATE, content)
-
-            async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
-                logger.info(f'Write to {filename}')
-                await f.write(content)
-    except (ServerDisconnectedError, TimeoutError, ClientConnectorError):
-        logger.warning(f'Client or timeout error. Retrying {url}')
-        await crawl(url, session)
+async def crawl(url: str, session: aiohttp.ClientSession):
+    # f*ck this shit. I believe it's temp solution
+    if 'css/telegram.css' in url:
+        return
+
+    ok = False
+    while not ok:
+        try:
+            await _crawl(url, session)
+            ok = True
+        except (RetryError, ServerDisconnectedError, TimeoutError, ClientConnectorError):
+            logger.warning(f'Client or timeout error. Retrying {url}')
+
+
+async def _crawl(url: str, session: aiohttp.ClientSession):
+    logger.info(f'Process {url}')
+    async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT, headers=HEADERS) as response:
+        if response.status // 100 == 5:
+            msg = f'Error 5XX. Retrying {url}'
+            logger.warning(msg)
+            raise RetryError(msg)
+
+        if response.status not in {200, 304}:
+            if response.status != 302:
+                content = await response.text()
+                logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
+            return
+
+        # bypass external slashes and so on
+        url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
+
+        is_hashable_only = is_hashable_only_content_type(response.content_type)
+        # amazing dirt for media files like
+        # telegram.org/file/811140591/1/q7zZHjgES6s/9d121a89ffb0015837
+        # with response content type HTML instead of image. Shame on you
+        # sometimes it returns correct type. noice load balancing
+        is_sucking_file = '/file/' in url and 'text' in response.content_type
+
+        # handle pure domains and html pages without ext in url
+        ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
+
+        # I don't add ext by content type for images and so on cuz TG servers sucks.
+        # Some servers do not return correct content type. Some servers do...
+        if is_hashable_only or is_sucking_file:
+            ext = ''
+
+        filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
+        os.makedirs(os.path.dirname(filename), exist_ok=True)
+
+        if is_sucking_file or is_hashable_only:
+            content = await response.read()
+            async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
+                await f.write(get_hash(content))
+            return
+
+        content = await response.text(encoding='UTF-8')
+        if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
+            content = await collect_translations_paginated_content(url, session)
+
+        content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
+        content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
+        content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
+        content = re.sub(NONCE_REGEX, NONCE_TEMPLATE, content)
+        content = re.sub(PROXY_CONFIG_SUB_NET_REGEX, PROXY_CONFIG_SUB_NET_TEMPLATE, content)
+        content = re.sub(TRANSLATE_SUGGESTION_REGEX, '', content)
+        content = re.sub(SPARKLE_SIG_REGEX, SPARKLE_SIG_TEMPLATE, content)
+        content = re.sub(SPARKLE_SE_REGEX, SPARKLE_SE_TEMPLATE, content)
+
+        async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
+            logger.info(f'Write to {filename}')
+            await f.write(content)
 
 
 async def start(url_list: set[str], mode: int):
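This is the "loop instead of rec" part of the commit: instead of crawl awaiting itself after every client or timeout error, which grows the coroutine chain with each retry, a thin crawl wrapper now loops until a single _crawl attempt succeeds, and a 5XX answer surfaces as RetryError so it joins the same retry path. Below is a distilled sketch of the pattern, with fetch_once standing in for _crawl and built-in exceptions standing in for the aiohttp errors.

    import asyncio
    import logging
    from typing import Awaitable, Callable

    logger = logging.getLogger(__name__)

    class RetryError(Exception):
        """Raised for 5XX responses that should simply be retried."""

    async def crawl(url: str, fetch_once: Callable[[str], Awaitable[None]]) -> None:
        # Loop-based retry: a failed attempt repeats the loop body in the same
        # stack frame instead of recursing with `await crawl(url, session)`.
        ok = False
        while not ok:
            try:
                await fetch_once(url)
                ok = True
            except (RetryError, asyncio.TimeoutError, ConnectionError):
                logger.warning(f'Client or timeout error. Retrying {url}')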
@@ -137,9 +137,27 @@ DOM_ATTRS = ['href', 'src']
 OUTPUT_FILENAME = os.environ.get('OUTPUT_FILENAME', 'tracked_links.txt')
 COMPARE_OUTPUT_WITH_FILENAME = os.environ.get('COMPARE_OUTPUT_WITH_FILENAME', OUTPUT_FILENAME)
 
+stel_dev_layer = 132
+
 # unsecure but so simple
-CONNECTOR = aiohttp.TCPConnector(ssl=False)
+CONNECTOR = aiohttp.TCPConnector(ssl=False, force_close=True, limit=300)
 TIMEOUT = aiohttp.ClientTimeout(total=10)
+HEADERS = {
+    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:99.0) Gecko/20100101 Firefox/99.0',
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
+    'Accept-Language': 'en-US,en;q=0.5',
+    'Accept-Encoding': 'gzip, deflate, br',
+    'DNT': '1',
+    'Connection': 'keep-alive',
+    'Cookie': f'stel_ln=en; stel_dev_layer={stel_dev_layer}',
+    'Upgrade-Insecure-Requests': '1',
+    'Sec-Fetch-Dest': 'document',
+    'Sec-Fetch-Mode': 'navigate',
+    'Sec-Fetch-Site': 'none',
+    'Sec-Fetch-User': '?1',
+    'Cache-Control': 'max-age=0',
+    'TE': 'trailers',
+}
 
 logging.basicConfig(format='%(message)s', level=logging.DEBUG)
 logger = logging.getLogger(__name__)
@@ -275,7 +293,22 @@ def is_trackable_content_type(content_type) -> bool:
     return False
 
 
-async def crawl(url: str, session: aiohttp.ClientSession):
+class ServerSideError(Exception):
+    pass
+
+
+async def crawl(url: str, session: aiohttp.ClientSession):
+    ok = False
+    while not ok:
+        try:
+            await _crawl(url, session)
+            ok = True
+        except (ServerSideError, ServerDisconnectedError, TimeoutError, ClientConnectorError):
+            logger.warning(f'Client or timeout error. Retrying {url}')
+            VISITED_LINKS.remove(url)
+
+
+async def _crawl(url: str, session: aiohttp.ClientSession):
     if url in VISITED_LINKS:
         return
     VISITED_LINKS.add(url)
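The link-list crawler gets the same loop-based retry, but with one extra step: _crawl marks the URL as visited before fetching, so the wrapper has to drop it from VISITED_LINKS again after a failure, otherwise the next attempt would bail out as already visited. A self-contained sketch of that interaction follows; the session argument and the actual fetching are omitted to keep it short.

    import asyncio
    import logging

    logger = logging.getLogger(__name__)

    VISITED_LINKS: set[str] = set()

    class ServerSideError(Exception):
        """Signals a 5XX answer that should be retried."""

    async def crawl(url: str) -> None:
        ok = False
        while not ok:
            try:
                await _crawl(url)
                ok = True
            except (ServerSideError, asyncio.TimeoutError, ConnectionError):
                logger.warning(f'Client or timeout error. Retrying {url}')
                # Un-mark the URL so the retry is not skipped as already visited.
                VISITED_LINKS.remove(url)

    async def _crawl(url: str) -> None:
        if url in VISITED_LINKS:
            return
        VISITED_LINKS.add(url)
        # ... fetch and parse the page here ...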
@@ -288,7 +321,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
         if response.status // 100 == 5:
             VISITED_LINKS.remove(url)
             logger.warning(f'Error 5XX. Retrying {url}')
-            return await crawl(url, session)
+            raise ServerSideError()
 
         if response.status not in {200, 304}:
             if response.status != 302:
@@ -327,15 +360,10 @@ async def crawl(url: str, session: aiohttp.ClientSession):
             logger.warning(f'Codec can\'t decode bytes. So it was a tgs file or response with broken content type {url}')
     # except ClientConnectorError:
     #     logger.warning(f'Wrong link: {url}')
-    except (ServerDisconnectedError, TimeoutError, ClientConnectorError):
-        logger.warning(f'Client or timeout error. Retrying {url}')
-        VISITED_LINKS.remove(url)
-        # sleep + count of attempts?
-        await crawl(url, session)
 
 
 async def start(url_list: set[str]):
-    async with aiohttp.ClientSession(connector=CONNECTOR) as session:
+    async with aiohttp.ClientSession(connector=CONNECTOR, headers=HEADERS) as session:
         await asyncio.gather(*[crawl(url, session) for url in url_list])
 
 
@@ -2,4 +2,5 @@ aiohttp==3.7.4.post0
 aiodns==3.0.0
 aiofiles==0.6.0
 Pyrogram==2.0.19
 TgCrypto==1.2.3
+# uvloop==0.16.0