mirror of
https://github.com/MarshalX/telegram-crawler.git
synced 2025-01-05 18:38:19 +01:00
track fragment
This commit is contained in:
parent
194ff9888c
commit
2dc60397dc
2 changed files with 7 additions and 3 deletions
|
@@ -38,6 +38,8 @@ TRANSLATIONS_EN_CATEGORY_URL_REGEX = r'/en/[a-z_]+/[a-z_]+/$'
|
|||
PAGE_GENERATION_TIME_REGEX = r'<!-- page generated in .+ -->'
|
||||
PAGE_API_HASH_REGEX = r'\?hash=[a-z0-9]+'
|
||||
PAGE_API_HASH_TEMPLATE = f'?hash={DYNAMIC_PART_MOCK}'
|
||||
TON_RATE_REGEX = r'"tonRate":"[.0-9]+"'
|
||||
TON_RATE_TEMPLATE = f'"tonRate":"{DYNAMIC_PART_MOCK}"'
|
||||
PASSPORT_SSID_REGEX = r'passport_ssid=[a-z0-9]+_[a-z0-9]+_[a-z0-9]+'
|
||||
PASSPORT_SSID_TEMPLATE = f'passport_ssid={DYNAMIC_PART_MOCK}'
|
||||
NONCE_REGEX = r'"nonce":"[a-z0-9]+_[a-z0-9]+_[a-z0-9]+'
|
||||
|
@@ -633,6 +635,7 @@ async def _crawl(url: str, session: aiohttp.ClientSession, output_dir: str):
|
|||
content = re.sub(PROXY_CONFIG_SUB_NET_REGEX, PROXY_CONFIG_SUB_NET_TEMPLATE, content)
|
||||
content = re.sub(SPARKLE_SIG_REGEX, SPARKLE_SIG_TEMPLATE, content)
|
||||
content = re.sub(SPARKLE_SE_REGEX, SPARKLE_SE_TEMPLATE, content)
|
||||
content = re.sub(TON_RATE_REGEX, TON_RATE_TEMPLATE, content)
|
||||
|
||||
async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
|
||||
logger.info(f'Write to {filename}')
|
||||
|
|
|
@@ -50,9 +50,10 @@ HIDDEN_URLS = {
|
|||
# 'a-webappcontent.stel.com/demo',
|
||||
# 'a-webappcontent.stel.com/cafe',
|
||||
|
||||
# 'fragment.com/about',
|
||||
# 'fragment.com/privacy',
|
||||
# 'fragment.com/terms',
|
||||
'fragment.com/about',
|
||||
'fragment.com/privacy',
|
||||
'fragment.com/terms',
|
||||
'fragment.com/js/auction.js',
|
||||
}
|
||||
ADDITIONAL_URLS = {
|
||||
'raw.githubusercontent.com/telegramdesktop/tdesktop/dev/Telegram/Resources/tl/mtproto.tl',
|
||||
|
|
Loading…
Reference in a new issue