move tracked links list to main branch;

add generation of content tree;
add workflow for tree generation;
add new tg hidden link.
Il'ya (Marshal) 2021-04-24 14:19:01 +02:00
parent 5f29beda73
commit 6ceb897e33
5 changed files with 120 additions and 4 deletions

.github/workflows/make_files_tree.yml (new file)

@@ -0,0 +1,50 @@
name: Fetch new content of tracked links to files

on:
  schedule:
    - cron: '0 * * * *'
  push:
    branches:
      - main-test

jobs:
  make_tracked_links_file:
    name: Make files tree
    runs-on: macos-10.15
    steps:
      - name: Clone.
        uses: actions/checkout@v2
        with:
          submodules: recursive

      - name: Setup Python.
        uses: actions/setup-python@v2
        with:
          python-version: 3.9

      - name: Install dependencies.
        run: |
          pip install -r requirements.txt

      - name: Generate files tree.
        env:
          OUTPUT_FOLDER: "data_ci/"
        run: |
          python make_files_tree.py

      - name: Commit and push changes.
        run: |
          git pull
          git checkout data
          rm -rf data
          mv data_ci data
          git config --global user.email "github-action@users.noreply.github.com"
          git config --global user.name "GitHub Action"
          git add .
          git commit -m "Update tracked links"
          git push
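A note on the OUTPUT_FOLDER handoff: the "Generate files tree." step writes into data_ci/ so the freshly fetched tree never collides with the checked-out data/ until the last step swaps the folders on the data branch. A minimal sketch (not the workflow's own code, just the pattern the script below relies on) of how the variable is read, with data/ as the fallback for local runs:

    # sketch: how make_files_tree.py (below) picks up the workflow's env block
    import os

    output_folder = os.environ.get('OUTPUT_FOLDER', 'data/')  # 'data_ci/' in CI
    print(f'fetched pages will be written under {output_folder}')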


@ -37,7 +37,6 @@ jobs:
- name: Commit and push changes. - name: Commit and push changes.
run: | run: |
git pull git pull
git checkout data
mv tracked_links_ci.txt tracked_links.txt mv tracked_links_ci.txt tracked_links.txt

make_files_tree.py (new file)

@@ -0,0 +1,61 @@
import asyncio
import logging
import os
from asyncio.exceptions import TimeoutError
from string import punctuation, whitespace
from time import time

import aiofiles
import aiohttp
from aiohttp import ClientConnectorError

PROTOCOL = 'https://'
ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace

INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')

# unsecure but so simple
CONNECTOR = aiohttp.TCPConnector(ssl=False)
TIMEOUT = aiohttp.ClientTimeout(total=30)

logging.basicConfig(format='%(message)s', level=logging.DEBUG)
logger = logging.getLogger(__name__)


async def crawl(url: str, session: aiohttp.ClientSession):
    try:
        logger.info(f'Process {url}')
        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
            if response.status != 200:
                return

            # bypass external slashes and so on
            url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]

            # handle pure domains and html pages without ext in url
            ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''

            filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
            os.makedirs(os.path.dirname(filename), exist_ok=True)

            async with aiofiles.open(filename, 'w') as f:
                logger.info(f'Write to {filename}')
                content = await response.text()
                await f.write(content)
    except (TimeoutError, ClientConnectorError):
        # retry the same url on transient network errors
        await asyncio.gather(crawl(url, session))


async def start(url_list: set[str]):
    async with aiohttp.ClientSession(connector=CONNECTOR) as session:
        await asyncio.gather(*[crawl(url, session) for url in url_list])


if __name__ == '__main__':
    with open(INPUT_FILENAME, 'r') as f:
        tracked_urls = set([l.replace('\n', '') for l in f.readlines()])

    logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
    start_time = time()
    asyncio.get_event_loop().run_until_complete(start(tracked_urls))
    logger.info(f'Stop crawling content. {time() - start_time} sec.')
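For illustration only, not part of the commit: a small sketch of how the path-mapping logic in crawl() above resolves a few sample URLs. The helper name target_filename and the sample URLs are assumptions made for the example; the expressions mirror the constants and logic in the file.

    # hypothetical helper replicating crawl()'s filename logic for a dry run
    from string import punctuation, whitespace

    ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace
    OUTPUT_FOLDER = 'data/'

    def target_filename(url: str) -> str:
        # stray slashes produce empty parts; '' in s is True for any s, so they are filtered too
        url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
        # bare domains and extension-less pages get .html appended
        ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
        return OUTPUT_FOLDER + '/'.join(url_parts) + ext

    assert target_filename('telegram.org') == 'data/telegram.org.html'
    assert target_filename('telegram.org/tour') == 'data/telegram.org/tour.html'
    assert target_filename('telegram.org/js/main.js') == 'data/telegram.org/js/main.js'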


@@ -19,6 +19,7 @@ HIDDEN_URLS = {
     'telegram.org/privacy/gmailbot',
     'telegram.org/tos',
     'telegram.org/tour',
+    'telegram.org/evolution',
     'desktop.telegram.org/changelog',
 }
@@ -171,6 +172,10 @@ def cleanup_links(links: set[str]) -> set[str]:
 async def crawl(url: str, session: aiohttp.ClientSession):
+    # todo
+    if url.endswith('.'):
+        return
+
     without_trailing_slash = url[:-1:] if url.endswith('/') else url
     if without_trailing_slash in VISITED_LINKS:
         return
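With this hunk, the crawl() prologue has two early exits: the new dot guard (still flagged # todo) and the pre-existing trailing-slash dedupe against VISITED_LINKS. A compact sketch of the combined flow; should_crawl and the VISITED_LINKS.add call are assumptions made for the example, not lines from the diff:

    # sketch of the two early exits in crawl(), folded into one predicate
    VISITED_LINKS: set[str] = set()

    def should_crawl(url: str) -> bool:
        if url.endswith('.'):
            return False  # malformed link, skipped by the new guard
        without_trailing_slash = url[:-1] if url.endswith('/') else url
        if without_trailing_slash in VISITED_LINKS:
            return False  # this page was already visited
        VISITED_LINKS.add(without_trailing_slash)
        return True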
@@ -228,10 +233,10 @@ async def start(url_list: set[str]):
 if __name__ == '__main__':
     HIDDEN_URLS.add(BASE_URL)
 
-    logger.info('Start crawling...')
+    logger.info('Start crawling links...')
     start_time = time()
     asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))
-    logger.info(f'Stop crawling. {time() - start_time} sec.')
+    logger.info(f'Stop crawling links. {time() - start_time} sec.')
 
     with open(OUTPUT_FILENAME, 'w') as f:
         f.write('\n'.join(sorted(LINKS_TO_TRACK)))

requirements.txt

@@ -1 +1,2 @@
 aiohttp==3.7.4.post0
+aiofiles==0.6.0