add tracking of all en translations strings

This commit is contained in:
Il'ya (Marshal) 2022-03-12 01:20:05 +01:00
parent 8efff578c3
commit 6932a49c2a
3 changed files with 95 additions and 5 deletions

View file

@ -18,6 +18,8 @@ DYNAMIC_PART_MOCK = 'telegram-crawler'
# Both values can be overridden through environment variables of the same name.
INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
# Prefix prepended to every output file path.
OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')
# URLs ending in /en/<segment>/<segment>/ (lowercase letters and underscores),
# e.g. /en/ios/unsorted/. Matching URLs have their content collected page by page.
TRANSLATIONS_EN_CATEGORY_URL_REGEX = r'/en/[a-z_]+/[a-z_]+/$'
# HTML comment carrying the page generation time; removed from saved content.
PAGE_GENERATION_TIME_REGEX = r'<!-- page generated in .+ -->'
# "?hash=<lowercase alphanumerics>" query part; replaced with PAGE_API_HASH_TEMPLATE.
PAGE_API_HASH_REGEX = r'\?hash=[a-z0-9]+'
# Fixed replacement for the hash query part, built from DYNAMIC_PART_MOCK.
PAGE_API_HASH_TEMPLATE = f'?hash={DYNAMIC_PART_MOCK}'
@ -30,16 +32,45 @@ PROXY_CONFIG_SUB_NET_TEMPLATE = 'X.X:8888;'
# insecure but simple: ssl=False turns off TLS certificate verification
CONNECTOR = aiohttp.TCPConnector(ssl=False)
TIMEOUT = aiohttp.ClientTimeout(total=30)
TIMEOUT = aiohttp.ClientTimeout(total=10)
logging.basicConfig(format='%(message)s', level=logging.DEBUG)
logger = logging.getLogger(__name__)
async def collect_translations_paginated_content(url: str, session: aiohttp.ClientSession) -> str:
    """Fetch every page of a paginated translations category and join them.

    Pages are requested with POST calls that carry ``offset`` and ``more=1``
    form fields plus an ``X-Requested-With: XMLHttpRequest`` header. Every
    JSON reply with a non-empty ``more_html`` value contributes one chunk and
    advances the offset by 200; the first reply without it ends the walk.

    Non-200 replies and timeout/connection errors are retried at the same
    offset with no retry limit. The walk is a plain loop rather than
    self-recursion, so neither a long category nor repeated retries deepen
    the call stack or keep earlier responses open.

    Args:
        url: Address without the protocol prefix (``PROTOCOL`` is prepended).
        session: Client session used to send the requests.

    Returns:
        All collected ``more_html`` chunks joined with newlines, or an empty
        string when the first page already has none.
    """
    headers = {'X-Requested-With': 'XMLHttpRequest'}
    page_step = 200
    chunks = []
    offset = 0

    while True:
        logger.info(f'Url: {url}, offset: {offset}')
        data = {'offset': offset, 'more': 1}

        try:
            async with session.post(
                f'{PROTOCOL}{url}', data=data, headers=headers, allow_redirects=False, timeout=TIMEOUT
            ) as response:
                if response.status != 200:
                    logger.debug(f'Resend cuz {response.status}')
                    continue  # same offset again

                payload = await response.json(encoding='UTF-8')
        # asyncio.TimeoutError is listed explicitly: before Python 3.11 it is
        # not the builtin TimeoutError, so the bare name alone only matches
        # when the module rebinds it.
        except (TimeoutError, asyncio.TimeoutError, ClientConnectorError):
            logger.warning(f'Client or timeout error. Retrying {url}; offset {offset}')
            continue  # same offset again

        if 'more_html' in payload and payload['more_html']:
            chunks.append(payload['more_html'])
            offset += page_step
        else:
            # Missing or empty "more_html" marks the end of the listing.
            break

    return '\n'.join(chunks)
async def crawl(url: str, session: aiohttp.ClientSession):
try:
logger.info(f'Process {url}')
async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
async with session.get(f'{PROTOCOL}{url}', allow_redirects=False, timeout=TIMEOUT) as response:
if response.status // 100 == 5:
logger.warning(f'Error 5XX. Retrying {url}')
return await asyncio.gather(crawl(url, session))
@ -57,9 +88,12 @@ async def crawl(url: str, session: aiohttp.ClientSession):
filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
content = await response.text(encoding='UTF-8')
if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
content = await collect_translations_paginated_content(url, session)
os.makedirs(os.path.dirname(filename), exist_ok=True)
async with aiofiles.open(filename, 'w') as f:
content = await response.text(encoding='UTF-8')
content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)

View file

@ -58,7 +58,8 @@ CRAWL_RULES = {
'allow': {
r'^[^/]*$', # root
r'org/[^/]*/$', # 1 lvl sub
r'/en/[a-z_]+/$' # 1 lvl after /en/
r'/en/[a-z_]+/$', # 1 lvl after /en/
r'/en/[a-z_]+/[a-z_]+/$', # 2 lvl after /en/. for example, /en/ios/unsorted/
},
'deny': {
'', # all
@ -130,7 +131,7 @@ COMPARE_OUTPUT_WITH_FILENAME = os.environ.get('COMPARE_OUTPUT_WITH_FILENAME', OU
# insecure but simple: ssl=False turns off TLS certificate verification
CONNECTOR = aiohttp.TCPConnector(ssl=False)
TIMEOUT = aiohttp.ClientTimeout(total=30)
TIMEOUT = aiohttp.ClientTimeout(total=10)
logging.basicConfig(format='%(message)s', level=logging.DEBUG)
logger = logging.getLogger(__name__)

View file

@ -3989,10 +3989,65 @@ themes.telegram.org/js/themes.js
translations.telegram.org
translations.telegram.org/en/
translations.telegram.org/en/android/
translations.telegram.org/en/android/bots_and_payments/
translations.telegram.org/en/android/camera_and_media/
translations.telegram.org/en/android/chat_list/
translations.telegram.org/en/android/general/
translations.telegram.org/en/android/groups_and_channels/
translations.telegram.org/en/android/login/
translations.telegram.org/en/android/passport/
translations.telegram.org/en/android/private_chats/
translations.telegram.org/en/android/profile/
translations.telegram.org/en/android/settings/
translations.telegram.org/en/android/unsorted/
translations.telegram.org/en/android_x/
translations.telegram.org/en/android_x/bots_and_payments/
translations.telegram.org/en/android_x/camera_and_media/
translations.telegram.org/en/android_x/chat_list/
translations.telegram.org/en/android_x/general/
translations.telegram.org/en/android_x/groups_and_channels/
translations.telegram.org/en/android_x/login/
translations.telegram.org/en/android_x/passport/
translations.telegram.org/en/android_x/private_chats/
translations.telegram.org/en/android_x/profile/
translations.telegram.org/en/android_x/settings/
translations.telegram.org/en/android_x/unsorted/
translations.telegram.org/en/ios/
translations.telegram.org/en/ios/bots_and_payments/
translations.telegram.org/en/ios/camera_and_media/
translations.telegram.org/en/ios/chat_list/
translations.telegram.org/en/ios/general/
translations.telegram.org/en/ios/groups_and_channels/
translations.telegram.org/en/ios/login/
translations.telegram.org/en/ios/passport/
translations.telegram.org/en/ios/private_chats/
translations.telegram.org/en/ios/profile/
translations.telegram.org/en/ios/settings/
translations.telegram.org/en/ios/unsorted/
translations.telegram.org/en/macos/
translations.telegram.org/en/macos/bots_and_payments/
translations.telegram.org/en/macos/camera_and_media/
translations.telegram.org/en/macos/chat_list/
translations.telegram.org/en/macos/general/
translations.telegram.org/en/macos/groups_and_channels/
translations.telegram.org/en/macos/login/
translations.telegram.org/en/macos/passport/
translations.telegram.org/en/macos/private_chats/
translations.telegram.org/en/macos/profile/
translations.telegram.org/en/macos/settings/
translations.telegram.org/en/macos/unsorted/
translations.telegram.org/en/tdesktop/
translations.telegram.org/en/tdesktop/bots_and_payments/
translations.telegram.org/en/tdesktop/camera_and_media/
translations.telegram.org/en/tdesktop/chat_list/
translations.telegram.org/en/tdesktop/general/
translations.telegram.org/en/tdesktop/groups_and_channels/
translations.telegram.org/en/tdesktop/login/
translations.telegram.org/en/tdesktop/passport/
translations.telegram.org/en/tdesktop/private_chats/
translations.telegram.org/en/tdesktop/profile/
translations.telegram.org/en/tdesktop/settings/
translations.telegram.org/en/tdesktop/unsorted/
tsf.telegram.org
tsf.telegram.org/auth
tsf.telegram.org/css/billboard.css