separate web and translations to web_tr

This commit is contained in:
Il'ya (Marshal) 2022-06-18 17:36:39 +02:00
parent d0c78eaff4
commit 35c5ca8773
7 changed files with 135 additions and 87 deletions

View file

@ -26,6 +26,9 @@ jobs:
- mode: web_res
os: ubuntu-20.04
- mode: web_tr
os: ubuntu-20.04
- mode: server
os: ubuntu-20.04
@ -68,6 +71,7 @@ jobs:
git pull
mv data/web_res data_ci/web_res
mv data/web_tr data_ci/web_tr
mv data/client data_ci/client
mv data/server data_ci/server
@ -81,12 +85,27 @@ jobs:
git pull
mv data/web data_ci/web
mv data/web_tr data_ci/web_tr
mv data/client data_ci/client
mv data/server data_ci/server
rm -rf data
mv data_ci data
# Runs only for the 'web_tr' matrix mode: on the 'data' branch, move the web,
# web_res, server and client directories from data/ into data_ci/, delete the
# rest of data/, then rename data_ci/ to data/.
# NOTE(review): presumably data_ci/ already holds the fresh web_tr output at this point - confirm against the crawl step.
- name: Prepare data.
if: matrix.mode == 'web_tr'
run: |
git checkout data
git pull
mv data/web data_ci/web
mv data/web_res data_ci/web_res
mv data/server data_ci/server
mv data/client data_ci/client
rm -rf data
mv data_ci data
- name: Prepare data.
if: matrix.mode == 'server'
run: |
@ -95,6 +114,7 @@ jobs:
mv data/web data_ci/web
mv data/web_res data_ci/web_res
mv data/web_tr data_ci/web_tr
mv data/client data_ci/client
rm -rf data
@ -108,6 +128,7 @@ jobs:
mv data/web data_ci/web
mv data/web_res data_ci/web_res
mv data/web_tr data_ci/web_tr
mv data/server data_ci/server
rm -rf data

View file

@ -34,7 +34,8 @@ jobs:
- name: Generate/update file with links.
env:
OUTPUT_FILENAME: "tracked_links_ci.txt"
COMPARE_OUTPUT_WITH_FILENAME: "tracked_links.txt"
OUTPUT_RESOURCES_FILENAME: "tracked_res_links_ci.txt"
OUTPUT_TRANSLATIONS_FILENAME: "tracked_tr_links_ci.txt"
run: |
python make_tracked_links_list.py
@ -43,6 +44,8 @@ jobs:
git pull
mv tracked_links_ci.txt tracked_links.txt
mv tracked_res_links_ci.txt tracked_res_links.txt
mv tracked_tr_links_ci.txt tracked_tr_links.txt
git config --global user.email "github-action@users.noreply.github.com"
git config --global user.name "GitHub Action"

View file

@ -41,10 +41,11 @@ STATUS_TO_EMOJI = {
}
AVAILABLE_HASHTAGS = {
'web_res', 'web', 'server', 'test_server', 'client', 'ios', 'macos', 'android', 'translations'
'web_tr', 'web_res', 'web', 'server', 'test_server', 'client', 'ios', 'macos', 'android'
}
HASHTAGS_PATTERNS = {
# regex will be more flexible. for example, in issue with double hashtag '#web #web_res' when data/res not changed
'web_tr': os.path.join(ROOT_TREE_DIR, 'web_tr'),
'web_res': os.path.join(ROOT_TREE_DIR, 'web_res'),
'web': os.path.join(ROOT_TREE_DIR, 'web'),
'server': os.path.join(ROOT_TREE_DIR, 'server'),
@ -53,10 +54,10 @@ HASHTAGS_PATTERNS = {
'ios': os.path.join(ROOT_TREE_DIR, 'client', 'ios-beta'),
'macos': os.path.join(ROOT_TREE_DIR, 'client', 'macos-beta'),
'android': os.path.join(ROOT_TREE_DIR, 'client', 'android-beta'),
'translations': os.path.join(ROOT_TREE_DIR, 'web', 'translations.telegram.org'),
}
# order is important!
PATHS_TO_REMOVE_FROM_ALERT = [
os.path.join(ROOT_TREE_DIR, 'web_tr'),
os.path.join(ROOT_TREE_DIR, 'web_res'),
os.path.join(ROOT_TREE_DIR, 'web'),
os.path.join(ROOT_TREE_DIR, 'server'),

View file

@ -25,11 +25,13 @@ DYNAMIC_PART_MOCK = 'telegram-crawler'
# Input lists of tracked URLs, one file per crawl category.
# Every list is read from its OWN environment variable, so overriding one of
# them does not silently redirect the other categories to the same file.
INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
INPUT_RES_FILENAME = os.environ.get('INPUT_RES_FILENAME', 'tracked_res_links.txt')
INPUT_TR_FILENAME = os.environ.get('INPUT_TR_FILENAME', 'tracked_tr_links.txt')

# Output layout: one sub-folder per category underneath OUTPUT_FOLDER.
# Each sub-folder name can be overridden through the variable of the same name.
OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')
OUTPUT_MTPROTO_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_MTPROTO_FOLDER', 'server/'))
OUTPUT_SITES_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_SITES_FOLDER', 'web/'))
OUTPUT_CLIENTS_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_CLIENTS_FOLDER', 'client/'))
OUTPUT_RESOURCES_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_RESOURCES_FOLDER', 'web_res/'))
# Uses its own variable; reusing OUTPUT_RESOURCES_FOLDER here would make an
# override of the resources folder send translations to the same directory.
OUTPUT_TRANSLATIONS_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_TRANSLATIONS_FOLDER', 'web_tr/'))

# Matches a path that ends with '/en/<segment>/<segment>/', where each segment
# consists of lowercase ASCII letters and underscores.
TRANSLATIONS_EN_CATEGORY_URL_REGEX = r'/en/[a-z_]+/[a-z_]+/$'
@ -514,7 +516,7 @@ class RetryError(Exception):
...
async def crawl(url: str, session: aiohttp.ClientSession, output_dir: str = OUTPUT_SITES_FOLDER):
async def crawl(url: str, session: aiohttp.ClientSession, output_dir: str):
while True:
try:
await _crawl(url, session, output_dir)
@ -585,18 +587,23 @@ async def _crawl(url: str, session: aiohttp.ClientSession, output_dir: str):
await f.write(content)
async def crawl_web(session: aiohttp.ClientSession):
with open(INPUT_FILENAME, 'r') as f:
async def _crawl_web(session: aiohttp.ClientSession, input_filename: str, output_folder=None):
with open(input_filename, 'r') as f:
tracked_urls = set([l.replace('\n', '') for l in f.readlines()])
await asyncio.gather(*[crawl(url, session) for url in tracked_urls])
await asyncio.gather(*[crawl(url, session, output_folder) for url in tracked_urls])
async def crawl_web(session: aiohttp.ClientSession):
await _crawl_web(session, INPUT_FILENAME, OUTPUT_SITES_FOLDER)
async def crawl_web_res(session: aiohttp.ClientSession):
with open(INPUT_RES_FILENAME, 'r') as f:
tracked_urls = set([l.replace('\n', '') for l in f.readlines()])
await _crawl_web(session, INPUT_RES_FILENAME, OUTPUT_RESOURCES_FOLDER)
await asyncio.gather(*[crawl(url, session, OUTPUT_RESOURCES_FOLDER) for url in tracked_urls])
async def crawl_web_tr(session: aiohttp.ClientSession):
    """Crawl the URLs listed in INPUT_TR_FILENAME into OUTPUT_TRANSLATIONS_FOLDER.

    Thin wrapper that delegates to ``_crawl_web`` with the translations
    input list and output folder.

    :param session: HTTP session forwarded to ``_crawl_web``.
    """
    await _crawl_web(
        session,
        input_filename=INPUT_TR_FILENAME,
        output_folder=OUTPUT_TRANSLATIONS_FOLDER,
    )
async def start(mode: str):
@ -604,6 +611,7 @@ async def start(mode: str):
mode == 'all' and await asyncio.gather(
crawl_web(session),
crawl_web_res(session),
crawl_web_tr(session),
track_mtproto_configs(),
download_telegram_android_beta_and_extract_resources(session),
download_telegram_macos_beta_and_extract_resources(session),
@ -615,6 +623,9 @@ async def start(mode: str):
mode == 'web_res' and await asyncio.gather(
crawl_web_res(session),
)
mode == 'web_tr' and await asyncio.gather(
crawl_web_tr(session),
)
mode == 'server' and await asyncio.gather(
track_mtproto_configs(),
)

View file

@ -148,6 +148,7 @@ DOM_ATTRS = ['href', 'src']
# Names of the generated link-list files. Each one can be overridden through
# the environment variable of the same name.
# NOTE(review): OUTPUT_FILENAME presumably receives the main tracked-links list - its write is not visible here.
OUTPUT_FILENAME = os.environ.get('OUTPUT_FILENAME', 'tracked_links.txt')
# Receives the sorted LINKS_TO_TRACKABLE_RESOURCES set, one URL per line.
OUTPUT_RESOURCES_FILENAME = os.environ.get('OUTPUT_RESOURCES_FILENAME', 'tracked_res_links.txt')
# Receives the sorted LINKS_TO_TRANSLATIONS set, one URL per line.
OUTPUT_TRANSLATIONS_FILENAME = os.environ.get('OUTPUT_TRANSLATIONS_FILENAME', 'tracked_tr_links.txt')
STEL_DEV_LAYER = 190
@ -176,6 +177,7 @@ logger = logging.getLogger(__name__)
VISITED_LINKS = set()
LINKS_TO_TRACK = set()
LINKS_TO_TRANSLATIONS = set()
LINKS_TO_TRACKABLE_RESOURCES = set()
@ -291,6 +293,10 @@ def _is_x_content_type(content_types_set: set[str], content_type) -> bool:
return False
def is_translation_url(url: str) -> bool:
    """Return True when *url* mentions the translations host.

    This is a plain substring test: the host name may appear anywhere in
    *url*, with or without a scheme in front of it.
    """
    translations_host = 'translations.telegram.org'
    return translations_host in url
def is_textable_content_type(content_type: str) -> bool:
textable_content_type = {
'plain',
@ -356,6 +362,9 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
return
if is_textable_content_type(content_type):
if is_translation_url(url):
LINKS_TO_TRANSLATIONS.add(url)
else:
LINKS_TO_TRACK.add(url)
# raw content will be cached by aiohttp. Don't worry about it
@ -380,10 +389,10 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
# A Telegram URL can work both with and without a trailing slash (no redirect). P.S. not on every subdomain ;d
# That makes it random which of the two variants ends up in the tracked links.
# The check below resolves this: if both variants are present, we keep the one without the trailing slash.
for links_set in (LINKS_TO_TRACK, LINKS_TO_TRANSLATIONS, LINKS_TO_TRACKABLE_RESOURCES):
without_trailing_slash = url[:-1:] if url.endswith('/') else url
if without_trailing_slash in LINKS_TO_TRACK and \
f'{without_trailing_slash}/' in LINKS_TO_TRACK:
LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')
if without_trailing_slash in links_set and f'{without_trailing_slash}/' in links_set:
links_set.remove(f'{without_trailing_slash}/')
except UnicodeDecodeError:
logger.warning(f'Codec can\'t decode bytes. So it was a tgs file or response with broken content type {url}')
@ -412,7 +421,7 @@ if __name__ == '__main__':
with open(filename, 'r') as f:
OLD_URL_LIST |= set([l.replace('\n', '') for l in f.readlines()])
CURRENT_URL_LIST = LINKS_TO_TRACK | LINKS_TO_TRACKABLE_RESOURCES
CURRENT_URL_LIST = LINKS_TO_TRACK | LINKS_TO_TRACKABLE_RESOURCES | LINKS_TO_TRANSLATIONS
logger.info(f'Is equal: {OLD_URL_LIST == CURRENT_URL_LIST}')
logger.info(f'Deleted: {OLD_URL_LIST - CURRENT_URL_LIST}')
@ -425,3 +434,6 @@ if __name__ == '__main__':
with open(OUTPUT_RESOURCES_FILENAME, 'w') as f:
f.write('\n'.join(sorted(LINKS_TO_TRACKABLE_RESOURCES)))
with open(OUTPUT_TRANSLATIONS_FILENAME, 'w') as f:
f.write('\n'.join(sorted(LINKS_TO_TRANSLATIONS)))

View file

@ -5916,77 +5916,6 @@ themes.telegram.org/js/jquery-ui.min.js
themes.telegram.org/js/jquery.min.js
themes.telegram.org/js/main-aj.js
themes.telegram.org/js/themes.js
translations.telegram.org
translations.telegram.org/css/billboard.css
translations.telegram.org/css/bootstrap-extra.css
translations.telegram.org/css/bootstrap.min.css
translations.telegram.org/css/contest-zoo.css
translations.telegram.org/css/health.css
translations.telegram.org/css/jquery-ui.min.css
translations.telegram.org/css/tchart.min.css
translations.telegram.org/css/telegram.css
translations.telegram.org/css/translations.css
translations.telegram.org/en/
translations.telegram.org/en/android/
translations.telegram.org/en/android/bots_and_payments/
translations.telegram.org/en/android/camera_and_media/
translations.telegram.org/en/android/chat_list/
translations.telegram.org/en/android/general/
translations.telegram.org/en/android/groups_and_channels/
translations.telegram.org/en/android/login/
translations.telegram.org/en/android/passport/
translations.telegram.org/en/android/private_chats/
translations.telegram.org/en/android/profile/
translations.telegram.org/en/android/settings/
translations.telegram.org/en/android/unsorted/
translations.telegram.org/en/android_x/
translations.telegram.org/en/android_x/bots_and_payments/
translations.telegram.org/en/android_x/camera_and_media/
translations.telegram.org/en/android_x/chat_list/
translations.telegram.org/en/android_x/general/
translations.telegram.org/en/android_x/groups_and_channels/
translations.telegram.org/en/android_x/login/
translations.telegram.org/en/android_x/passport/
translations.telegram.org/en/android_x/private_chats/
translations.telegram.org/en/android_x/profile/
translations.telegram.org/en/android_x/settings/
translations.telegram.org/en/android_x/unsorted/
translations.telegram.org/en/ios/
translations.telegram.org/en/ios/bots_and_payments/
translations.telegram.org/en/ios/camera_and_media/
translations.telegram.org/en/ios/chat_list/
translations.telegram.org/en/ios/general/
translations.telegram.org/en/ios/groups_and_channels/
translations.telegram.org/en/ios/login/
translations.telegram.org/en/ios/passport/
translations.telegram.org/en/ios/private_chats/
translations.telegram.org/en/ios/profile/
translations.telegram.org/en/ios/settings/
translations.telegram.org/en/ios/unsorted/
translations.telegram.org/en/macos/
translations.telegram.org/en/macos/bots_and_payments/
translations.telegram.org/en/macos/camera_and_media/
translations.telegram.org/en/macos/chat_list/
translations.telegram.org/en/macos/general/
translations.telegram.org/en/macos/groups_and_channels/
translations.telegram.org/en/macos/login/
translations.telegram.org/en/macos/passport/
translations.telegram.org/en/macos/private_chats/
translations.telegram.org/en/macos/profile/
translations.telegram.org/en/macos/settings/
translations.telegram.org/en/macos/unsorted/
translations.telegram.org/en/tdesktop/
translations.telegram.org/en/tdesktop/bots_and_payments/
translations.telegram.org/en/tdesktop/camera_and_media/
translations.telegram.org/en/tdesktop/chat_list/
translations.telegram.org/en/tdesktop/general/
translations.telegram.org/en/tdesktop/groups_and_channels/
translations.telegram.org/en/tdesktop/login/
translations.telegram.org/en/tdesktop/passport/
translations.telegram.org/en/tdesktop/private_chats/
translations.telegram.org/en/tdesktop/profile/
translations.telegram.org/en/tdesktop/settings/
translations.telegram.org/en/tdesktop/unsorted/
tsf.telegram.org
tsf.telegram.org/auth
tsf.telegram.org/css/billboard.css

71
tracked_tr_links.txt Normal file
View file

@ -0,0 +1,71 @@
translations.telegram.org
translations.telegram.org/css/billboard.css
translations.telegram.org/css/bootstrap-extra.css
translations.telegram.org/css/bootstrap.min.css
translations.telegram.org/css/contest-zoo.css
translations.telegram.org/css/health.css
translations.telegram.org/css/jquery-ui.min.css
translations.telegram.org/css/tchart.min.css
translations.telegram.org/css/telegram.css
translations.telegram.org/css/translations.css
translations.telegram.org/en/
translations.telegram.org/en/android/
translations.telegram.org/en/android/bots_and_payments/
translations.telegram.org/en/android/camera_and_media/
translations.telegram.org/en/android/chat_list/
translations.telegram.org/en/android/general/
translations.telegram.org/en/android/groups_and_channels/
translations.telegram.org/en/android/login/
translations.telegram.org/en/android/passport/
translations.telegram.org/en/android/private_chats/
translations.telegram.org/en/android/profile/
translations.telegram.org/en/android/settings/
translations.telegram.org/en/android/unsorted/
translations.telegram.org/en/android_x/
translations.telegram.org/en/android_x/bots_and_payments/
translations.telegram.org/en/android_x/camera_and_media/
translations.telegram.org/en/android_x/chat_list/
translations.telegram.org/en/android_x/general/
translations.telegram.org/en/android_x/groups_and_channels/
translations.telegram.org/en/android_x/login/
translations.telegram.org/en/android_x/passport/
translations.telegram.org/en/android_x/private_chats/
translations.telegram.org/en/android_x/profile/
translations.telegram.org/en/android_x/settings/
translations.telegram.org/en/android_x/unsorted/
translations.telegram.org/en/ios/
translations.telegram.org/en/ios/bots_and_payments/
translations.telegram.org/en/ios/camera_and_media/
translations.telegram.org/en/ios/chat_list/
translations.telegram.org/en/ios/general/
translations.telegram.org/en/ios/groups_and_channels/
translations.telegram.org/en/ios/login/
translations.telegram.org/en/ios/passport/
translations.telegram.org/en/ios/private_chats/
translations.telegram.org/en/ios/profile/
translations.telegram.org/en/ios/settings/
translations.telegram.org/en/ios/unsorted/
translations.telegram.org/en/macos/
translations.telegram.org/en/macos/bots_and_payments/
translations.telegram.org/en/macos/camera_and_media/
translations.telegram.org/en/macos/chat_list/
translations.telegram.org/en/macos/general/
translations.telegram.org/en/macos/groups_and_channels/
translations.telegram.org/en/macos/login/
translations.telegram.org/en/macos/passport/
translations.telegram.org/en/macos/private_chats/
translations.telegram.org/en/macos/profile/
translations.telegram.org/en/macos/settings/
translations.telegram.org/en/macos/unsorted/
translations.telegram.org/en/tdesktop/
translations.telegram.org/en/tdesktop/bots_and_payments/
translations.telegram.org/en/tdesktop/camera_and_media/
translations.telegram.org/en/tdesktop/chat_list/
translations.telegram.org/en/tdesktop/general/
translations.telegram.org/en/tdesktop/groups_and_channels/
translations.telegram.org/en/tdesktop/login/
translations.telegram.org/en/tdesktop/passport/
translations.telegram.org/en/tdesktop/private_chats/
translations.telegram.org/en/tdesktop/profile/
translations.telegram.org/en/tdesktop/settings/
translations.telegram.org/en/tdesktop/unsorted/