From 6ceb897e33e5c8a81cf6978f315b30c60d9341d0 Mon Sep 17 00:00:00 2001
From: "Il'ya (Marshal)"
Date: Sat, 24 Apr 2021 14:19:01 +0200
Subject: [PATCH] move tracked links list to main branch; add generation of
 content tree; add workflow for tree generation; add new tg hidden link.

---
 .github/workflows/make_files_tree.yml         | 50 +++++++++++++++
 .github/workflows/make_tracked_links_list.yml |  1 -
 make_files_tree.py                            | 61 +++++++++++++++++++
 make_tracked_links_list.py                    |  9 ++-
 requirements.txt                              |  3 +-
 5 files changed, 120 insertions(+), 4 deletions(-)
 create mode 100644 .github/workflows/make_files_tree.yml
 create mode 100644 make_files_tree.py

diff --git a/.github/workflows/make_files_tree.yml b/.github/workflows/make_files_tree.yml
new file mode 100644
index 0000000000..cf2cb6c640
--- /dev/null
+++ b/.github/workflows/make_files_tree.yml
@@ -0,0 +1,50 @@
+name: Fetch new content of tracked links to files
+
+on:
+  schedule:
+    - cron: '0 * * * *'
+  push:
+    branches:
+      - main-test
+
+jobs:
+  make_files_tree:
+    name: Make files tree
+    runs-on: macos-10.15
+
+    steps:
+
+      - name: Clone.
+        uses: actions/checkout@v2
+        with:
+          submodules: recursive
+
+      - name: Setup Python.
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.9
+
+      - name: Install dependencies.
+        run: |
+          pip install -r requirements.txt
+
+      - name: Generate files tree.
+        env:
+          OUTPUT_FOLDER: "data_ci/"
+        run: |
+          python make_files_tree.py
+
+      - name: Commit and push changes.
+        run: |
+          git pull
+          git checkout data
+
+          rm -rf data
+          mv data_ci data
+
+          git config --global user.email "github-action@users.noreply.github.com"
+          git config --global user.name "GitHub Action"
+
+          git add .
+          git commit -m "Update files tree"
+          git push
diff --git a/.github/workflows/make_tracked_links_list.yml b/.github/workflows/make_tracked_links_list.yml
index 30f138126a..b82bb3ee0e 100644
--- a/.github/workflows/make_tracked_links_list.yml
+++ b/.github/workflows/make_tracked_links_list.yml
@@ -37,7 +37,6 @@ jobs:
       - name: Commit and push changes.
         run: |
           git pull
-          git checkout data
 
           mv tracked_links_ci.txt tracked_links.txt
 
diff --git a/make_files_tree.py b/make_files_tree.py
new file mode 100644
index 0000000000..408ecb3902
--- /dev/null
+++ b/make_files_tree.py
@@ -0,0 +1,61 @@
+import asyncio
+import logging
+import os
+from asyncio.exceptions import TimeoutError
+from string import punctuation, whitespace
+from time import time
+
+import aiofiles
+import aiohttp
+from aiohttp import ClientConnectorError
+
+PROTOCOL = 'https://'
+ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace
+
+INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
+OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')
+
+# insecure, but simple
+CONNECTOR = aiohttp.TCPConnector(ssl=False)
+TIMEOUT = aiohttp.ClientTimeout(total=30)
+
+logging.basicConfig(format='%(message)s', level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+
+async def crawl(url: str, session: aiohttp.ClientSession):
+    try:
+        logger.info(f'Process {url}')
+        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
+            if response.status != 200:
+                return
+
+            # bypass external slashes and so on
+            url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
+            # handle pure domains and html pages without ext in url
+            ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
+
+            filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
+
+            os.makedirs(os.path.dirname(filename), exist_ok=True)
+            async with aiofiles.open(filename, 'w') as f:
+                logger.info(f'Write to {filename}')
+                content = await response.text()
+                await f.write(content)
+    except (TimeoutError, ClientConnectorError):
+        await asyncio.gather(crawl(url, session))
+
+
+async def start(url_list: set[str]):
+    async with aiohttp.ClientSession(connector=CONNECTOR, timeout=TIMEOUT) as session:
+        await asyncio.gather(*[crawl(url, session) for url in url_list])
+
+
+if __name__ == '__main__':
+    with open(INPUT_FILENAME, 'r') as f:
+        tracked_urls = set([l.replace('\n', '') for l in f.readlines()])
+
+    logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
+    start_time = time()
+    asyncio.get_event_loop().run_until_complete(start(tracked_urls))
+    logger.info(f'Stop crawling content. {time() - start_time} sec.')
diff --git a/make_tracked_links_list.py b/make_tracked_links_list.py
index d9943187cd..8d2ebf23d9 100644
--- a/make_tracked_links_list.py
+++ b/make_tracked_links_list.py
@@ -19,6 +19,7 @@ HIDDEN_URLS = {
     'telegram.org/privacy/gmailbot',
     'telegram.org/tos',
     'telegram.org/tour',
+    'telegram.org/evolution',
     'desktop.telegram.org/changelog',
 }
 
@@ -171,6 +172,10 @@ def cleanup_links(links: set[str]) -> set[str]:
 
 
 async def crawl(url: str, session: aiohttp.ClientSession):
+    # todo: handle urls ending with a dot; skip them for now
+    if url.endswith('.'):
+        return
+
     without_trailing_slash = url[:-1:] if url.endswith('/') else url
     if without_trailing_slash in VISITED_LINKS:
         return
@@ -228,10 +233,10 @@ async def start(url_list: set[str]):
 if __name__ == '__main__':
     HIDDEN_URLS.add(BASE_URL)
 
-    logger.info('Start crawling...')
+    logger.info('Start crawling links...')
     start_time = time()
     asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))
-    logger.info(f'Stop crawling. {time() - start_time} sec.')
+    logger.info(f'Stop crawling links. {time() - start_time} sec.')
 
     with open(OUTPUT_FILENAME, 'w') as f:
         f.write('\n'.join(sorted(LINKS_TO_TRACK)))
diff --git a/requirements.txt b/requirements.txt
index af91ed2676..38657a34cc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
-aiohttp==3.7.4.post0
\ No newline at end of file
+aiohttp==3.7.4.post0
+aiofiles==0.6.0
\ No newline at end of file
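
A quick sketch of the path-mapping rule that crawl() in make_files_tree.py applies when saving a page. The helper name target_path and the sample URLs are illustrative only; the two-line mapping logic itself is lifted verbatim from the script above:

from string import punctuation, whitespace

# Same constants as make_files_tree.py: every punctuation character
# except '.' plus all whitespace counts as an illegal path segment.
ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace
OUTPUT_FOLDER = 'data/'


def target_path(url: str) -> str:
    # The substring test drops empty segments (from stray slashes)
    # and segments that are a single illegal character.
    url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
    # Bare domains and pages without an extension get '.html' appended.
    ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
    return OUTPUT_FOLDER + '/'.join(url_parts) + ext


# Illustrative inputs, not necessarily present in tracked_links.txt:
assert target_path('telegram.org') == 'data/telegram.org.html'
assert target_path('telegram.org/evolution') == 'data/telegram.org/evolution.html'
assert target_path('core.telegram.org/js/main.js') == 'data/core.telegram.org/js/main.js'

The '.html' fallback also avoids a file/directory name collision: a bare domain is saved as telegram.org.html, leaving the telegram.org/ directory free for its subpages.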