mirror of
https://github.com/MarshalX/telegram-crawler.git
synced 2024-11-26 01:07:37 +01:00
separate web and translations to web_tr
This commit is contained in:
parent
d0c78eaff4
commit
35c5ca8773
7 changed files with 135 additions and 87 deletions
21
.github/workflows/make_files_tree.yml
vendored
21
.github/workflows/make_files_tree.yml
vendored
|
@ -26,6 +26,9 @@ jobs:
|
|||
- mode: web_res
|
||||
os: ubuntu-20.04
|
||||
|
||||
- mode: web_tr
|
||||
os: ubuntu-20.04
|
||||
|
||||
- mode: server
|
||||
os: ubuntu-20.04
|
||||
|
||||
|
@ -68,6 +71,7 @@ jobs:
|
|||
git pull
|
||||
|
||||
mv data/web_res data_ci/web_res
|
||||
mv data/web_tr data_ci/web_tr
|
||||
mv data/client data_ci/client
|
||||
mv data/server data_ci/server
|
||||
|
||||
|
@ -81,12 +85,27 @@ jobs:
|
|||
git pull
|
||||
|
||||
mv data/web data_ci/web
|
||||
mv data/web_tr data_ci/web_tr
|
||||
mv data/client data_ci/client
|
||||
mv data/server data_ci/server
|
||||
|
||||
rm -rf data
|
||||
mv data_ci data
|
||||
|
||||
- name: Prepare data.
|
||||
if: matrix.mode == 'web_tr'
|
||||
run: |
|
||||
git checkout data
|
||||
git pull
|
||||
|
||||
mv data/web data_ci/web
|
||||
mv data/web_res data_ci/web_res
|
||||
mv data/server data_ci/server
|
||||
mv data/client data_ci/client
|
||||
|
||||
rm -rf data
|
||||
mv data_ci data
|
||||
|
||||
- name: Prepare data.
|
||||
if: matrix.mode == 'server'
|
||||
run: |
|
||||
|
@ -95,6 +114,7 @@ jobs:
|
|||
|
||||
mv data/web data_ci/web
|
||||
mv data/web_res data_ci/web_res
|
||||
mv data/web_tr data_ci/web_tr
|
||||
mv data/client data_ci/client
|
||||
|
||||
rm -rf data
|
||||
|
@ -108,6 +128,7 @@ jobs:
|
|||
|
||||
mv data/web data_ci/web
|
||||
mv data/web_res data_ci/web_res
|
||||
mv data/web_tr data_ci/web_tr
|
||||
mv data/server data_ci/server
|
||||
|
||||
rm -rf data
|
||||
|
|
|
@ -34,7 +34,8 @@ jobs:
|
|||
- name: Generate/update file with links.
|
||||
env:
|
||||
OUTPUT_FILENAME: "tracked_links_ci.txt"
|
||||
COMPARE_OUTPUT_WITH_FILENAME: "tracked_links.txt"
|
||||
OUTPUT_RESOURCES_FILENAME: "tracked_res_links_ci.txt"
|
||||
OUTPUT_TRANSLATIONS_FILENAME: "tracked_tr_links_ci.txt"
|
||||
run: |
|
||||
python make_tracked_links_list.py
|
||||
|
||||
|
@ -43,6 +44,8 @@ jobs:
|
|||
git pull
|
||||
|
||||
mv tracked_links_ci.txt tracked_links.txt
|
||||
mv tracked_res_links_ci.txt tracked_res_links.txt
|
||||
mv tracked_tr_links_ci.txt tracked_tr_links.txt
|
||||
|
||||
git config --global user.email "github-action@users.noreply.github.com"
|
||||
git config --global user.name "GitHub Action"
|
||||
|
|
|
@ -41,10 +41,11 @@ STATUS_TO_EMOJI = {
|
|||
}
|
||||
|
||||
AVAILABLE_HASHTAGS = {
|
||||
'web_res', 'web', 'server', 'test_server', 'client', 'ios', 'macos', 'android', 'translations'
|
||||
'web_tr', 'web_res', 'web', 'server', 'test_server', 'client', 'ios', 'macos', 'android'
|
||||
}
|
||||
HASHTAGS_PATTERNS = {
|
||||
# A regex would be more flexible, e.g. for the double-hashtag issue ('#web #web_res') when data/res has not changed
|
||||
'web_tr': os.path.join(ROOT_TREE_DIR, 'web_tr'),
|
||||
'web_res': os.path.join(ROOT_TREE_DIR, 'web_res'),
|
||||
'web': os.path.join(ROOT_TREE_DIR, 'web'),
|
||||
'server': os.path.join(ROOT_TREE_DIR, 'server'),
|
||||
|
@ -53,10 +54,10 @@ HASHTAGS_PATTERNS = {
|
|||
'ios': os.path.join(ROOT_TREE_DIR, 'client', 'ios-beta'),
|
||||
'macos': os.path.join(ROOT_TREE_DIR, 'client', 'macos-beta'),
|
||||
'android': os.path.join(ROOT_TREE_DIR, 'client', 'android-beta'),
|
||||
'translations': os.path.join(ROOT_TREE_DIR, 'web', 'translations.telegram.org'),
|
||||
}
|
||||
# order is important!
|
||||
PATHS_TO_REMOVE_FROM_ALERT = [
|
||||
os.path.join(ROOT_TREE_DIR, 'web_tr'),
|
||||
os.path.join(ROOT_TREE_DIR, 'web_res'),
|
||||
os.path.join(ROOT_TREE_DIR, 'web'),
|
||||
os.path.join(ROOT_TREE_DIR, 'server'),
|
||||
|
|
|
@ -25,11 +25,13 @@ DYNAMIC_PART_MOCK = 'telegram-crawler'
|
|||
|
||||
# Files holding the tracked link lists read by the crawler.
# Each list has its own environment override; previously the resources and
# translations lists both read INPUT_FILENAME, so overriding the main list
# silently pointed all three at the same file.
INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
INPUT_RES_FILENAME = os.environ.get('INPUT_RES_FILENAME', 'tracked_res_links.txt')
INPUT_TR_FILENAME = os.environ.get('INPUT_TR_FILENAME', 'tracked_tr_links.txt')
|
||||
# Destination folders for crawled data, all rooted at OUTPUT_FOLDER.
# Every sub-folder can be overridden through an environment variable of the
# same name as the constant.
OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')
OUTPUT_MTPROTO_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_MTPROTO_FOLDER', 'server/'))
OUTPUT_SITES_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_SITES_FOLDER', 'web/'))
OUTPUT_CLIENTS_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_CLIENTS_FOLDER', 'client/'))
OUTPUT_RESOURCES_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_RESOURCES_FOLDER', 'web_res/'))
# Reads its own variable: it used to read OUTPUT_RESOURCES_FOLDER, so
# overriding the resources folder also redirected translations into it.
OUTPUT_TRANSLATIONS_FOLDER = os.path.join(OUTPUT_FOLDER, os.environ.get('OUTPUT_TRANSLATIONS_FOLDER', 'web_tr/'))
|
||||
|
||||
TRANSLATIONS_EN_CATEGORY_URL_REGEX = r'/en/[a-z_]+/[a-z_]+/$'
|
||||
|
||||
|
@ -514,7 +516,7 @@ class RetryError(Exception):
|
|||
...
|
||||
|
||||
|
||||
async def crawl(url: str, session: aiohttp.ClientSession, output_dir: str = OUTPUT_SITES_FOLDER):
|
||||
async def crawl(url: str, session: aiohttp.ClientSession, output_dir: str):
|
||||
while True:
|
||||
try:
|
||||
await _crawl(url, session, output_dir)
|
||||
|
@ -585,18 +587,23 @@ async def _crawl(url: str, session: aiohttp.ClientSession, output_dir: str):
|
|||
await f.write(content)
|
||||
|
||||
|
||||
async def crawl_web(session: aiohttp.ClientSession):
|
||||
with open(INPUT_FILENAME, 'r') as f:
|
||||
async def _crawl_web(session: aiohttp.ClientSession, input_filename: str, output_folder=None):
|
||||
with open(input_filename, 'r') as f:
|
||||
tracked_urls = set([l.replace('\n', '') for l in f.readlines()])
|
||||
|
||||
await asyncio.gather(*[crawl(url, session) for url in tracked_urls])
|
||||
await asyncio.gather(*[crawl(url, session, output_folder) for url in tracked_urls])
|
||||
|
||||
|
||||
async def crawl_web(session: aiohttp.ClientSession):
    """Run ``_crawl_web`` over the INPUT_FILENAME link list with OUTPUT_SITES_FOLDER as the output folder."""
    await _crawl_web(
        session,
        input_filename=INPUT_FILENAME,
        output_folder=OUTPUT_SITES_FOLDER,
    )
|
||||
|
||||
|
||||
async def crawl_web_res(session: aiohttp.ClientSession):
|
||||
with open(INPUT_RES_FILENAME, 'r') as f:
|
||||
tracked_urls = set([l.replace('\n', '') for l in f.readlines()])
|
||||
await _crawl_web(session, INPUT_RES_FILENAME, OUTPUT_RESOURCES_FOLDER)
|
||||
|
||||
await asyncio.gather(*[crawl(url, session, OUTPUT_RESOURCES_FOLDER) for url in tracked_urls])
|
||||
|
||||
async def crawl_web_tr(session: aiohttp.ClientSession):
    """Run ``_crawl_web`` over the INPUT_TR_FILENAME link list with OUTPUT_TRANSLATIONS_FOLDER as the output folder."""
    await _crawl_web(
        session,
        input_filename=INPUT_TR_FILENAME,
        output_folder=OUTPUT_TRANSLATIONS_FOLDER,
    )
|
||||
|
||||
|
||||
async def start(mode: str):
|
||||
|
@ -604,6 +611,7 @@ async def start(mode: str):
|
|||
mode == 'all' and await asyncio.gather(
|
||||
crawl_web(session),
|
||||
crawl_web_res(session),
|
||||
crawl_web_tr(session),
|
||||
track_mtproto_configs(),
|
||||
download_telegram_android_beta_and_extract_resources(session),
|
||||
download_telegram_macos_beta_and_extract_resources(session),
|
||||
|
@ -615,6 +623,9 @@ async def start(mode: str):
|
|||
mode == 'web_res' and await asyncio.gather(
|
||||
crawl_web_res(session),
|
||||
)
|
||||
mode == 'web_tr' and await asyncio.gather(
|
||||
crawl_web_tr(session),
|
||||
)
|
||||
mode == 'server' and await asyncio.gather(
|
||||
track_mtproto_configs(),
|
||||
)
|
||||
|
|
|
@ -148,6 +148,7 @@ DOM_ATTRS = ['href', 'src']
|
|||
|
||||
OUTPUT_FILENAME = os.environ.get('OUTPUT_FILENAME', 'tracked_links.txt')
|
||||
OUTPUT_RESOURCES_FILENAME = os.environ.get('OUTPUT_RESOURCES_FILENAME', 'tracked_res_links.txt')
|
||||
OUTPUT_TRANSLATIONS_FILENAME = os.environ.get('OUTPUT_TRANSLATIONS_FILENAME', 'tracked_tr_links.txt')
|
||||
|
||||
STEL_DEV_LAYER = 190
|
||||
|
||||
|
@ -176,6 +177,7 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
VISITED_LINKS = set()
|
||||
LINKS_TO_TRACK = set()
|
||||
LINKS_TO_TRANSLATIONS = set()
|
||||
LINKS_TO_TRACKABLE_RESOURCES = set()
|
||||
|
||||
|
||||
|
@ -291,6 +293,10 @@ def _is_x_content_type(content_types_set: set[str], content_type) -> bool:
|
|||
return False
|
||||
|
||||
|
||||
def is_translation_url(url: str) -> bool:
    """Return True when ``url`` contains the translations.telegram.org host name.

    The check is a plain substring search, so the host name may appear
    anywhere in ``url``.
    """
    translations_host = 'translations.telegram.org'
    return url.find(translations_host) != -1
|
||||
|
||||
|
||||
def is_textable_content_type(content_type: str) -> bool:
|
||||
textable_content_type = {
|
||||
'plain',
|
||||
|
@ -356,7 +362,10 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
|
|||
return
|
||||
|
||||
if is_textable_content_type(content_type):
|
||||
LINKS_TO_TRACK.add(url)
|
||||
if is_translation_url(url):
|
||||
LINKS_TO_TRANSLATIONS.add(url)
|
||||
else:
|
||||
LINKS_TO_TRACK.add(url)
|
||||
|
||||
# raw content will be cached by aiohttp. Don't worry about it
|
||||
raw_content = await response.read()
|
||||
|
@ -380,10 +389,10 @@ async def _crawl(url: str, session: aiohttp.ClientSession):
|
|||
# A Telegram URL can work both with and without a trailing slash (no redirect), though not on every subdomain.
|
||||
# This is a problem because which of the two links gets added is effectively random.
|
||||
# This check resolves the issue: if both links are available, we prefer the one without the trailing slash.
|
||||
without_trailing_slash = url[:-1:] if url.endswith('/') else url
|
||||
if without_trailing_slash in LINKS_TO_TRACK and \
|
||||
f'{without_trailing_slash}/' in LINKS_TO_TRACK:
|
||||
LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')
|
||||
for links_set in (LINKS_TO_TRACK, LINKS_TO_TRANSLATIONS, LINKS_TO_TRACKABLE_RESOURCES):
|
||||
without_trailing_slash = url[:-1:] if url.endswith('/') else url
|
||||
if without_trailing_slash in links_set and f'{without_trailing_slash}/' in links_set:
|
||||
links_set.remove(f'{without_trailing_slash}/')
|
||||
except UnicodeDecodeError:
|
||||
logger.warning(f'Codec can\'t decode bytes. So it was a tgs file or response with broken content type {url}')
|
||||
|
||||
|
@ -412,7 +421,7 @@ if __name__ == '__main__':
|
|||
with open(filename, 'r') as f:
|
||||
OLD_URL_LIST |= set([l.replace('\n', '') for l in f.readlines()])
|
||||
|
||||
CURRENT_URL_LIST = LINKS_TO_TRACK | LINKS_TO_TRACKABLE_RESOURCES
|
||||
CURRENT_URL_LIST = LINKS_TO_TRACK | LINKS_TO_TRACKABLE_RESOURCES | LINKS_TO_TRANSLATIONS
|
||||
|
||||
logger.info(f'Is equal: {OLD_URL_LIST == CURRENT_URL_LIST}')
|
||||
logger.info(f'Deleted: {OLD_URL_LIST - CURRENT_URL_LIST}')
|
||||
|
@ -425,3 +434,6 @@ if __name__ == '__main__':
|
|||
|
||||
with open(OUTPUT_RESOURCES_FILENAME, 'w') as f:
|
||||
f.write('\n'.join(sorted(LINKS_TO_TRACKABLE_RESOURCES)))
|
||||
|
||||
with open(OUTPUT_TRANSLATIONS_FILENAME, 'w') as f:
|
||||
f.write('\n'.join(sorted(LINKS_TO_TRANSLATIONS)))
|
||||
|
|
|
@ -5916,77 +5916,6 @@ themes.telegram.org/js/jquery-ui.min.js
|
|||
themes.telegram.org/js/jquery.min.js
|
||||
themes.telegram.org/js/main-aj.js
|
||||
themes.telegram.org/js/themes.js
|
||||
translations.telegram.org
|
||||
translations.telegram.org/css/billboard.css
|
||||
translations.telegram.org/css/bootstrap-extra.css
|
||||
translations.telegram.org/css/bootstrap.min.css
|
||||
translations.telegram.org/css/contest-zoo.css
|
||||
translations.telegram.org/css/health.css
|
||||
translations.telegram.org/css/jquery-ui.min.css
|
||||
translations.telegram.org/css/tchart.min.css
|
||||
translations.telegram.org/css/telegram.css
|
||||
translations.telegram.org/css/translations.css
|
||||
translations.telegram.org/en/
|
||||
translations.telegram.org/en/android/
|
||||
translations.telegram.org/en/android/bots_and_payments/
|
||||
translations.telegram.org/en/android/camera_and_media/
|
||||
translations.telegram.org/en/android/chat_list/
|
||||
translations.telegram.org/en/android/general/
|
||||
translations.telegram.org/en/android/groups_and_channels/
|
||||
translations.telegram.org/en/android/login/
|
||||
translations.telegram.org/en/android/passport/
|
||||
translations.telegram.org/en/android/private_chats/
|
||||
translations.telegram.org/en/android/profile/
|
||||
translations.telegram.org/en/android/settings/
|
||||
translations.telegram.org/en/android/unsorted/
|
||||
translations.telegram.org/en/android_x/
|
||||
translations.telegram.org/en/android_x/bots_and_payments/
|
||||
translations.telegram.org/en/android_x/camera_and_media/
|
||||
translations.telegram.org/en/android_x/chat_list/
|
||||
translations.telegram.org/en/android_x/general/
|
||||
translations.telegram.org/en/android_x/groups_and_channels/
|
||||
translations.telegram.org/en/android_x/login/
|
||||
translations.telegram.org/en/android_x/passport/
|
||||
translations.telegram.org/en/android_x/private_chats/
|
||||
translations.telegram.org/en/android_x/profile/
|
||||
translations.telegram.org/en/android_x/settings/
|
||||
translations.telegram.org/en/android_x/unsorted/
|
||||
translations.telegram.org/en/ios/
|
||||
translations.telegram.org/en/ios/bots_and_payments/
|
||||
translations.telegram.org/en/ios/camera_and_media/
|
||||
translations.telegram.org/en/ios/chat_list/
|
||||
translations.telegram.org/en/ios/general/
|
||||
translations.telegram.org/en/ios/groups_and_channels/
|
||||
translations.telegram.org/en/ios/login/
|
||||
translations.telegram.org/en/ios/passport/
|
||||
translations.telegram.org/en/ios/private_chats/
|
||||
translations.telegram.org/en/ios/profile/
|
||||
translations.telegram.org/en/ios/settings/
|
||||
translations.telegram.org/en/ios/unsorted/
|
||||
translations.telegram.org/en/macos/
|
||||
translations.telegram.org/en/macos/bots_and_payments/
|
||||
translations.telegram.org/en/macos/camera_and_media/
|
||||
translations.telegram.org/en/macos/chat_list/
|
||||
translations.telegram.org/en/macos/general/
|
||||
translations.telegram.org/en/macos/groups_and_channels/
|
||||
translations.telegram.org/en/macos/login/
|
||||
translations.telegram.org/en/macos/passport/
|
||||
translations.telegram.org/en/macos/private_chats/
|
||||
translations.telegram.org/en/macos/profile/
|
||||
translations.telegram.org/en/macos/settings/
|
||||
translations.telegram.org/en/macos/unsorted/
|
||||
translations.telegram.org/en/tdesktop/
|
||||
translations.telegram.org/en/tdesktop/bots_and_payments/
|
||||
translations.telegram.org/en/tdesktop/camera_and_media/
|
||||
translations.telegram.org/en/tdesktop/chat_list/
|
||||
translations.telegram.org/en/tdesktop/general/
|
||||
translations.telegram.org/en/tdesktop/groups_and_channels/
|
||||
translations.telegram.org/en/tdesktop/login/
|
||||
translations.telegram.org/en/tdesktop/passport/
|
||||
translations.telegram.org/en/tdesktop/private_chats/
|
||||
translations.telegram.org/en/tdesktop/profile/
|
||||
translations.telegram.org/en/tdesktop/settings/
|
||||
translations.telegram.org/en/tdesktop/unsorted/
|
||||
tsf.telegram.org
|
||||
tsf.telegram.org/auth
|
||||
tsf.telegram.org/css/billboard.css
|
||||
|
|
71
tracked_tr_links.txt
Normal file
71
tracked_tr_links.txt
Normal file
|
@ -0,0 +1,71 @@
|
|||
translations.telegram.org
|
||||
translations.telegram.org/css/billboard.css
|
||||
translations.telegram.org/css/bootstrap-extra.css
|
||||
translations.telegram.org/css/bootstrap.min.css
|
||||
translations.telegram.org/css/contest-zoo.css
|
||||
translations.telegram.org/css/health.css
|
||||
translations.telegram.org/css/jquery-ui.min.css
|
||||
translations.telegram.org/css/tchart.min.css
|
||||
translations.telegram.org/css/telegram.css
|
||||
translations.telegram.org/css/translations.css
|
||||
translations.telegram.org/en/
|
||||
translations.telegram.org/en/android/
|
||||
translations.telegram.org/en/android/bots_and_payments/
|
||||
translations.telegram.org/en/android/camera_and_media/
|
||||
translations.telegram.org/en/android/chat_list/
|
||||
translations.telegram.org/en/android/general/
|
||||
translations.telegram.org/en/android/groups_and_channels/
|
||||
translations.telegram.org/en/android/login/
|
||||
translations.telegram.org/en/android/passport/
|
||||
translations.telegram.org/en/android/private_chats/
|
||||
translations.telegram.org/en/android/profile/
|
||||
translations.telegram.org/en/android/settings/
|
||||
translations.telegram.org/en/android/unsorted/
|
||||
translations.telegram.org/en/android_x/
|
||||
translations.telegram.org/en/android_x/bots_and_payments/
|
||||
translations.telegram.org/en/android_x/camera_and_media/
|
||||
translations.telegram.org/en/android_x/chat_list/
|
||||
translations.telegram.org/en/android_x/general/
|
||||
translations.telegram.org/en/android_x/groups_and_channels/
|
||||
translations.telegram.org/en/android_x/login/
|
||||
translations.telegram.org/en/android_x/passport/
|
||||
translations.telegram.org/en/android_x/private_chats/
|
||||
translations.telegram.org/en/android_x/profile/
|
||||
translations.telegram.org/en/android_x/settings/
|
||||
translations.telegram.org/en/android_x/unsorted/
|
||||
translations.telegram.org/en/ios/
|
||||
translations.telegram.org/en/ios/bots_and_payments/
|
||||
translations.telegram.org/en/ios/camera_and_media/
|
||||
translations.telegram.org/en/ios/chat_list/
|
||||
translations.telegram.org/en/ios/general/
|
||||
translations.telegram.org/en/ios/groups_and_channels/
|
||||
translations.telegram.org/en/ios/login/
|
||||
translations.telegram.org/en/ios/passport/
|
||||
translations.telegram.org/en/ios/private_chats/
|
||||
translations.telegram.org/en/ios/profile/
|
||||
translations.telegram.org/en/ios/settings/
|
||||
translations.telegram.org/en/ios/unsorted/
|
||||
translations.telegram.org/en/macos/
|
||||
translations.telegram.org/en/macos/bots_and_payments/
|
||||
translations.telegram.org/en/macos/camera_and_media/
|
||||
translations.telegram.org/en/macos/chat_list/
|
||||
translations.telegram.org/en/macos/general/
|
||||
translations.telegram.org/en/macos/groups_and_channels/
|
||||
translations.telegram.org/en/macos/login/
|
||||
translations.telegram.org/en/macos/passport/
|
||||
translations.telegram.org/en/macos/private_chats/
|
||||
translations.telegram.org/en/macos/profile/
|
||||
translations.telegram.org/en/macos/settings/
|
||||
translations.telegram.org/en/macos/unsorted/
|
||||
translations.telegram.org/en/tdesktop/
|
||||
translations.telegram.org/en/tdesktop/bots_and_payments/
|
||||
translations.telegram.org/en/tdesktop/camera_and_media/
|
||||
translations.telegram.org/en/tdesktop/chat_list/
|
||||
translations.telegram.org/en/tdesktop/general/
|
||||
translations.telegram.org/en/tdesktop/groups_and_channels/
|
||||
translations.telegram.org/en/tdesktop/login/
|
||||
translations.telegram.org/en/tdesktop/passport/
|
||||
translations.telegram.org/en/tdesktop/private_chats/
|
||||
translations.telegram.org/en/tdesktop/profile/
|
||||
translations.telegram.org/en/tdesktop/settings/
|
||||
translations.telegram.org/en/tdesktop/unsorted/
|
Loading…
Reference in a new issue