Mirror of https://github.com/MarshalX/telegram-crawler.git, synced 2024-11-21 14:56:46 +01:00.
move tracked links list to main branch;
add generation of content tree; add workflow for tree generation; add new tg hidden link.
parent 5f29beda73
commit 6ceb897e33
5 changed files with 120 additions and 4 deletions
.github/workflows/make_files_tree.yml (vendored, new file, 50 lines)
@@ -0,0 +1,50 @@
name: Fetch new content of tracked links to files

on:
  schedule:
    - cron: '0 * * * *'
  push:
    branches:
      - main-test

jobs:
  make_tracked_links_file:
    name: Make files tree
    runs-on: macos-10.15

    steps:

      - name: Clone.
        uses: actions/checkout@v2
        with:
          submodules: recursive

      - name: Setup Python.
        uses: actions/setup-python@v2
        with:
          python-version: 3.9

      - name: Install dependencies.
        run: |
          pip install -r requirements.txt

      - name: Generate files tree.
        env:
          OUTPUT_FOLDER: "data_ci/"
        run: |
          python make_files_tree.py

      - name: Commit and push changes.
        run: |
          git pull
          git checkout data

          rm -rf data
          mv data_ci data

          git config --global user.email "github-action@users.noreply.github.com"
          git config --global user.name "GitHub Action"

          git add .
          git commit -m "Update tracked links"
          git push
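A rough local equivalent of the "Generate files tree" step above, for checking the crawler output before it ever reaches the data branch, might look like the sketch below. The environment variable names come from make_files_tree.py; the data_local/ folder and the use of subprocess are illustrative choices, not part of the workflow.

# Hypothetical local dry run of the tree-generation step.
# INPUT_FILENAME / OUTPUT_FOLDER are the env vars read by make_files_tree.py;
# 'data_local/' is an arbitrary folder picked here so CI's data_ci/ stays untouched.
import os
import subprocess

env = os.environ.copy()
env['INPUT_FILENAME'] = 'tracked_links.txt'
env['OUTPUT_FOLDER'] = 'data_local/'
subprocess.run(['python', 'make_files_tree.py'], check=True, env=env)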
A second hunk adjusts the commit step of the existing workflow that regenerates the tracked links list:

@@ -37,7 +37,6 @@ jobs:
      - name: Commit and push changes.
        run: |
          git pull
          git checkout data

          mv tracked_links_ci.txt tracked_links.txt
make_files_tree.py (new file, 61 lines)
@@ -0,0 +1,61 @@
import asyncio
import logging
import os
from asyncio.exceptions import TimeoutError
from string import punctuation, whitespace
from time import time

import aiofiles
import aiohttp
from aiohttp import ClientConnectorError

PROTOCOL = 'https://'
ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace

INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')

# unsecure but so simple
CONNECTOR = aiohttp.TCPConnector(ssl=False)
TIMEOUT = aiohttp.ClientTimeout(total=30)

logging.basicConfig(format='%(message)s', level=logging.DEBUG)
logger = logging.getLogger(__name__)


async def crawl(url: str, session: aiohttp.ClientSession):
    try:
        logger.info(f'Process {url}')
        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
            if response.status != 200:
                return

            # bypass external slashes and so on
            url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
            # handle pure domains and html pages without ext in url
            ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''

            filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext

            os.makedirs(os.path.dirname(filename), exist_ok=True)
            async with aiofiles.open(filename, 'w') as f:
                logger.info(f'Write to {filename}')
                content = await response.text()
                await f.write(content)
    except (TimeoutError, ClientConnectorError):
        await asyncio.gather(crawl(url, session))


async def start(url_list: set[str]):
    async with aiohttp.ClientSession(connector=CONNECTOR) as session:
        await asyncio.gather(*[crawl(url, session) for url in url_list])


if __name__ == '__main__':
    with open(INPUT_FILENAME, 'r') as f:
        tracked_urls = set([l.replace('\n', '') for l in f.readlines()])

    logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
    start_time = time()
    asyncio.get_event_loop().run_until_complete(start(tracked_urls))
    logger.info(f'Stop crawling content. {time() - start_time} sec.')
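To make the path building in crawl() easier to follow, here is the same url_parts / ext / filename logic restated as a small standalone helper. The function name output_path is illustrative only and does not exist in the script.

# Illustration of how crawl() turns a tracked URL into an output file path.
from string import punctuation, whitespace

OUTPUT_FOLDER = 'data/'
ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace

def output_path(url: str) -> str:
    # drop empty or punctuation-only segments left by stray slashes
    url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
    # bare domains and extension-less pages are stored with an .html suffix
    ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
    return OUTPUT_FOLDER + '/'.join(url_parts) + ext

print(output_path('telegram.org'))             # data/telegram.org.html
print(output_path('telegram.org/tos'))         # data/telegram.org/tos.html
print(output_path('telegram.org/js/main.js'))  # data/telegram.org/js/main.js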
The remaining hunks modify the existing script that builds the tracked links list. The new hidden link from the commit message lands in HIDDEN_URLS:

@@ -19,6 +19,7 @@ HIDDEN_URLS = {
    'telegram.org/privacy/gmailbot',
    'telegram.org/tos',
    'telegram.org/tour',
    'telegram.org/evolution',

    'desktop.telegram.org/changelog',
}
@@ -171,6 +172,10 @@ def cleanup_links(links: set[str]) -> set[str]:


async def crawl(url: str, session: aiohttp.ClientSession):
    # todo
    if url.endswith('.'):
        return

    without_trailing_slash = url[:-1:] if url.endswith('/') else url
    if without_trailing_slash in VISITED_LINKS:
        return
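The hunk above adds an early exit for links that end with '.' on top of the existing trailing-slash de-duplication. A condensed, hypothetical restatement of that gate follows; should_crawl is not a real function in the script, and VISITED_LINKS here is a local stand-in for the module-level set.

# Hypothetical restatement of the checks at the top of crawl().
VISITED_LINKS: set[str] = set()

def should_crawl(url: str) -> bool:
    if url.endswith('.'):  # malformed link, skipped outright
        return False
    # 'example.org/page' and 'example.org/page/' count as the same link
    without_trailing_slash = url[:-1] if url.endswith('/') else url
    if without_trailing_slash in VISITED_LINKS:
        return False
    VISITED_LINKS.add(without_trailing_slash)
    return True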
@@ -228,10 +233,10 @@ async def start(url_list: set[str]):
if __name__ == '__main__':
    HIDDEN_URLS.add(BASE_URL)

-    logger.info('Start crawling...')
+    logger.info('Start crawling links...')
    start_time = time()
    asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))
-    logger.info(f'Stop crawling. {time() - start_time} sec.')
+    logger.info(f'Stop crawling links. {time() - start_time} sec.')

    with open(OUTPUT_FILENAME, 'w') as f:
        f.write('\n'.join(sorted(LINKS_TO_TRACK)))
@@ -1 +1,2 @@
aiohttp==3.7.4.post0
+aiofiles==0.6.0