mirror of
https://github.com/MarshalX/telegram-crawler.git
synced 2025-03-27 18:20:44 +01:00
add tracking of all en translations strings
This commit is contained in:
parent
8efff578c3
commit
6932a49c2a
3 changed files with 95 additions and 5 deletions
|
@ -18,6 +18,8 @@ DYNAMIC_PART_MOCK = 'telegram-crawler'
|
|||
INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
|
||||
OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')
|
||||
|
||||
TRANSLATIONS_EN_CATEGORY_URL_REGEX = r'/en/[a-z_]+/[a-z_]+/$'
|
||||
|
||||
PAGE_GENERATION_TIME_REGEX = r'<!-- page generated in .+ -->'
|
||||
PAGE_API_HASH_REGEX = r'\?hash=[a-z0-9]+'
|
||||
PAGE_API_HASH_TEMPLATE = f'?hash={DYNAMIC_PART_MOCK}'
|
||||
|
@ -30,16 +32,45 @@ PROXY_CONFIG_SUB_NET_TEMPLATE = 'X.X:8888;'
|
|||
|
||||
# insecure but simple: ssl=False skips TLS certificate verification
|
||||
CONNECTOR = aiohttp.TCPConnector(ssl=False)
|
||||
TIMEOUT = aiohttp.ClientTimeout(total=30)
|
||||
TIMEOUT = aiohttp.ClientTimeout(total=10)
|
||||
|
||||
logging.basicConfig(format='%(message)s', level=logging.DEBUG)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def collect_translations_paginated_content(url: str, session: aiohttp.ClientSession) -> str:
    """Fetch every page of a paginated translations category and join them.

    Each page is requested with a POST to ``PROTOCOL + url`` carrying the form
    fields ``offset`` and ``more=1`` plus the ``X-Requested-With:
    XMLHttpRequest`` header. The JSON reply may contain an HTML chunk under
    the ``more_html`` key; chunks are collected in request order and the
    offset is advanced by 200 until a reply has no (or an empty) ``more_html``.

    Non-200 replies, timeouts and connection errors are retried at the same
    offset with no retry limit, exactly as the recursive version did. The
    loop is iterative so that retries and long categories do not pile up
    nested coroutine frames, and so that a rejected response is released
    before the retry is sent instead of being held open underneath it.

    :param url: Category URL without the protocol prefix.
    :param session: Client session used to send the POST requests.
    :return: All received ``more_html`` chunks joined with newlines; an empty
        string when the first page already carries no ``more_html``.
    """
    headers = {'X-Requested-With': 'XMLHttpRequest'}
    # Increment applied to ``offset`` between requests (presumably the number
    # of strings the server returns per page -- confirm against the site).
    offset_step = 200
    chunks = []
    offset = 0

    while True:
        logger.info(f'Url: {url}, offset: {offset}')
        data = {'offset': offset, 'more': 1}

        try:
            async with session.post(
                f'{PROTOCOL}{url}', data=data, headers=headers, allow_redirects=False, timeout=TIMEOUT
            ) as response:
                if response.status != 200:
                    logger.debug(f'Resend cuz {response.status}')
                    # Leaving the ``async with`` releases this response
                    # before the same offset is requested again.
                    continue

                # Named ``payload`` rather than ``json`` to avoid shadowing
                # the standard-library module name.
                payload = await response.json(encoding='UTF-8')
        except (TimeoutError, ClientConnectorError):
            logger.warning(f'Client or timeout error. Retrying {url}; offset {offset}')
            continue

        # A missing or empty ``more_html`` marks the end of the pagination.
        if not ('more_html' in payload and payload['more_html']):
            break

        chunks.append(payload['more_html'])
        offset += offset_step

    return '\n'.join(chunks)
|
||||
|
||||
|
||||
async def crawl(url: str, session: aiohttp.ClientSession):
|
||||
try:
|
||||
logger.info(f'Process {url}')
|
||||
async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
|
||||
async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
|
||||
if response.status // 100 == 5:
|
||||
logger.warning(f'Error 5XX. Retrying {url}')
|
||||
return await asyncio.gather(crawl(url, session))
|
||||
|
@ -57,9 +88,12 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
|||
|
||||
filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
|
||||
|
||||
content = await response.text(encoding='UTF-8')
|
||||
if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
|
||||
content = await collect_translations_paginated_content(url, session)
|
||||
|
||||
os.makedirs(os.path.dirname(filename), exist_ok=True)
|
||||
async with aiofiles.open(filename, 'w') as f:
|
||||
content = await response.text(encoding='UTF-8')
|
||||
content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
|
||||
content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
|
||||
content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
|
||||
|
|
|
@ -58,7 +58,8 @@ CRAWL_RULES = {
|
|||
'allow': {
|
||||
r'^[^/]*$', # root
|
||||
r'org/[^/]*/$', # 1 lvl sub
|
||||
r'/en/[a-z_]+/$' # 1 lvl after /en/
|
||||
r'/en/[a-z_]+/$', # 1 lvl after /en/
|
||||
r'/en/[a-z_]+/[a-z_]+/$', # 2 lvl after /en/. for example, /en/ios/unsorted/
|
||||
},
|
||||
'deny': {
|
||||
'', # all
|
||||
|
@ -130,7 +131,7 @@ COMPARE_OUTPUT_WITH_FILENAME = os.environ.get('COMPARE_OUTPUT_WITH_FILENAME', OU
|
|||
|
||||
# insecure but simple: ssl=False skips TLS certificate verification
|
||||
CONNECTOR = aiohttp.TCPConnector(ssl=False)
|
||||
TIMEOUT = aiohttp.ClientTimeout(total=30)
|
||||
TIMEOUT = aiohttp.ClientTimeout(total=10)
|
||||
|
||||
logging.basicConfig(format='%(message)s', level=logging.DEBUG)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
|
@ -3989,10 +3989,65 @@ themes.telegram.org/js/themes.js
|
|||
translations.telegram.org
|
||||
translations.telegram.org/en/
|
||||
translations.telegram.org/en/android/
|
||||
translations.telegram.org/en/android/bots_and_payments/
|
||||
translations.telegram.org/en/android/camera_and_media/
|
||||
translations.telegram.org/en/android/chat_list/
|
||||
translations.telegram.org/en/android/general/
|
||||
translations.telegram.org/en/android/groups_and_channels/
|
||||
translations.telegram.org/en/android/login/
|
||||
translations.telegram.org/en/android/passport/
|
||||
translations.telegram.org/en/android/private_chats/
|
||||
translations.telegram.org/en/android/profile/
|
||||
translations.telegram.org/en/android/settings/
|
||||
translations.telegram.org/en/android/unsorted/
|
||||
translations.telegram.org/en/android_x/
|
||||
translations.telegram.org/en/android_x/bots_and_payments/
|
||||
translations.telegram.org/en/android_x/camera_and_media/
|
||||
translations.telegram.org/en/android_x/chat_list/
|
||||
translations.telegram.org/en/android_x/general/
|
||||
translations.telegram.org/en/android_x/groups_and_channels/
|
||||
translations.telegram.org/en/android_x/login/
|
||||
translations.telegram.org/en/android_x/passport/
|
||||
translations.telegram.org/en/android_x/private_chats/
|
||||
translations.telegram.org/en/android_x/profile/
|
||||
translations.telegram.org/en/android_x/settings/
|
||||
translations.telegram.org/en/android_x/unsorted/
|
||||
translations.telegram.org/en/ios/
|
||||
translations.telegram.org/en/ios/bots_and_payments/
|
||||
translations.telegram.org/en/ios/camera_and_media/
|
||||
translations.telegram.org/en/ios/chat_list/
|
||||
translations.telegram.org/en/ios/general/
|
||||
translations.telegram.org/en/ios/groups_and_channels/
|
||||
translations.telegram.org/en/ios/login/
|
||||
translations.telegram.org/en/ios/passport/
|
||||
translations.telegram.org/en/ios/private_chats/
|
||||
translations.telegram.org/en/ios/profile/
|
||||
translations.telegram.org/en/ios/settings/
|
||||
translations.telegram.org/en/ios/unsorted/
|
||||
translations.telegram.org/en/macos/
|
||||
translations.telegram.org/en/macos/bots_and_payments/
|
||||
translations.telegram.org/en/macos/camera_and_media/
|
||||
translations.telegram.org/en/macos/chat_list/
|
||||
translations.telegram.org/en/macos/general/
|
||||
translations.telegram.org/en/macos/groups_and_channels/
|
||||
translations.telegram.org/en/macos/login/
|
||||
translations.telegram.org/en/macos/passport/
|
||||
translations.telegram.org/en/macos/private_chats/
|
||||
translations.telegram.org/en/macos/profile/
|
||||
translations.telegram.org/en/macos/settings/
|
||||
translations.telegram.org/en/macos/unsorted/
|
||||
translations.telegram.org/en/tdesktop/
|
||||
translations.telegram.org/en/tdesktop/bots_and_payments/
|
||||
translations.telegram.org/en/tdesktop/camera_and_media/
|
||||
translations.telegram.org/en/tdesktop/chat_list/
|
||||
translations.telegram.org/en/tdesktop/general/
|
||||
translations.telegram.org/en/tdesktop/groups_and_channels/
|
||||
translations.telegram.org/en/tdesktop/login/
|
||||
translations.telegram.org/en/tdesktop/passport/
|
||||
translations.telegram.org/en/tdesktop/private_chats/
|
||||
translations.telegram.org/en/tdesktop/profile/
|
||||
translations.telegram.org/en/tdesktop/settings/
|
||||
translations.telegram.org/en/tdesktop/unsorted/
|
||||
tsf.telegram.org
|
||||
tsf.telegram.org/auth
|
||||
tsf.telegram.org/css/billboard.css
|
||||
|
|
Loading…
Add table
Reference in a new issue