move tracked links list to main branch;

add generation of content tree;
add workflow for tree generation;
add new tg hidden link.
Il'ya (Marshal) 2021-04-24 14:19:01 +02:00
parent 5f29beda73
commit 6ceb897e33
5 changed files with 120 additions and 4 deletions

.github/workflows/make_files_tree.yml (new file)

@@ -0,0 +1,50 @@
name: Fetch new content of tracked links to files

on:
  schedule:
    - cron: '0 * * * *'
  push:
    branches:
      - main-test

jobs:
  make_tracked_links_file:
    name: Make files tree
    runs-on: macos-10.15
    steps:
      - name: Clone.
        uses: actions/checkout@v2
        with:
          submodules: recursive

      - name: Setup Python.
        uses: actions/setup-python@v2
        with:
          python-version: 3.9

      - name: Install dependencies.
        run: |
          pip install -r requirements.txt

      - name: Generate files tree.
        env:
          OUTPUT_FOLDER: "data_ci/"
        run: |
          python make_files_tree.py

      - name: Commit and push changes.
        run: |
          git pull
          git checkout data
          rm -rf data
          mv data_ci data
          git config --global user.email "github-action@users.noreply.github.com"
          git config --global user.name "GitHub Action"
          git add .
          git commit -m "Update tracked links"
          git push
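A note on the OUTPUT_FOLDER handoff: the "Generate files tree." step writes into data_ci/ so the freshly fetched tree never collides with the checked-out data/ until the last step swaps the folders on the data branch. A minimal sketch (not the workflow's own code, just the pattern the script below relies on) of how the variable is read, with data/ as the fallback for local runs:

    # sketch: how make_files_tree.py (below) picks up the workflow's env block
    import os

    output_folder = os.environ.get('OUTPUT_FOLDER', 'data/')  # 'data_ci/' in CI
    print(f'fetched pages will be written under {output_folder}')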


@ -37,7 +37,6 @@ jobs:
- name: Commit and push changes. - name: Commit and push changes.
run: | run: |
git pull git pull
git checkout data
mv tracked_links_ci.txt tracked_links.txt mv tracked_links_ci.txt tracked_links.txt

make_files_tree.py (new file)

@@ -0,0 +1,61 @@
import asyncio
import logging
import os
from asyncio.exceptions import TimeoutError
from string import punctuation, whitespace
from time import time

import aiofiles
import aiohttp
from aiohttp import ClientConnectorError

PROTOCOL = 'https://'
ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace

INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')

# unsecure but so simple
CONNECTOR = aiohttp.TCPConnector(ssl=False)
TIMEOUT = aiohttp.ClientTimeout(total=30)

logging.basicConfig(format='%(message)s', level=logging.DEBUG)
logger = logging.getLogger(__name__)


async def crawl(url: str, session: aiohttp.ClientSession):
    try:
        logger.info(f'Process {url}')
        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
            if response.status != 200:
                return

            # bypass external slashes and so on
            url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]

            # handle pure domains and html pages without ext in url
            ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''

            filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
            os.makedirs(os.path.dirname(filename), exist_ok=True)

            async with aiofiles.open(filename, 'w') as f:
                logger.info(f'Write to {filename}')
                content = await response.text()
                await f.write(content)
    except (TimeoutError, ClientConnectorError):
        # retry the same url on transient network errors
        await asyncio.gather(crawl(url, session))


async def start(url_list: set[str]):
    async with aiohttp.ClientSession(connector=CONNECTOR) as session:
        await asyncio.gather(*[crawl(url, session) for url in url_list])


if __name__ == '__main__':
    with open(INPUT_FILENAME, 'r') as f:
        tracked_urls = set([l.replace('\n', '') for l in f.readlines()])

    logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
    start_time = time()
    asyncio.get_event_loop().run_until_complete(start(tracked_urls))
    logger.info(f'Stop crawling content. {time() - start_time} sec.')
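For illustration only, not part of the commit: a small sketch of how the path-mapping logic in crawl() above resolves a few sample URLs. The helper name target_filename and the sample URLs are assumptions made for the example; the expressions mirror the constants and logic in the file.

    # hypothetical helper replicating crawl()'s filename logic for a dry run
    from string import punctuation, whitespace

    ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace
    OUTPUT_FOLDER = 'data/'

    def target_filename(url: str) -> str:
        # stray slashes produce empty parts; '' in s is True for any s, so they are filtered too
        url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
        # bare domains and extension-less pages get .html appended
        ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
        return OUTPUT_FOLDER + '/'.join(url_parts) + ext

    assert target_filename('telegram.org') == 'data/telegram.org.html'
    assert target_filename('telegram.org/tour') == 'data/telegram.org/tour.html'
    assert target_filename('telegram.org/js/main.js') == 'data/telegram.org/js/main.js'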


@@ -19,6 +19,7 @@ HIDDEN_URLS = {
     'telegram.org/privacy/gmailbot',
     'telegram.org/tos',
     'telegram.org/tour',
+    'telegram.org/evolution',
     'desktop.telegram.org/changelog',
 }
@@ -171,6 +172,10 @@ def cleanup_links(links: set[str]) -> set[str]:
 async def crawl(url: str, session: aiohttp.ClientSession):
+    # todo
+    if url.endswith('.'):
+        return
+
     without_trailing_slash = url[:-1:] if url.endswith('/') else url
     if without_trailing_slash in VISITED_LINKS:
         return
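With this hunk, the crawl() prologue has two early exits: the new dot guard (still flagged # todo) and the pre-existing trailing-slash dedupe against VISITED_LINKS. A compact sketch of the combined flow; should_crawl and the VISITED_LINKS.add call are assumptions made for the example, not lines from the diff:

    # sketch of the two early exits in crawl(), folded into one predicate
    VISITED_LINKS: set[str] = set()

    def should_crawl(url: str) -> bool:
        if url.endswith('.'):
            return False  # malformed link, skipped by the new guard
        without_trailing_slash = url[:-1] if url.endswith('/') else url
        if without_trailing_slash in VISITED_LINKS:
            return False  # this page was already visited
        VISITED_LINKS.add(without_trailing_slash)
        return True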
@@ -228,10 +233,10 @@ async def start(url_list: set[str]):
 if __name__ == '__main__':
     HIDDEN_URLS.add(BASE_URL)
 
-    logger.info('Start crawling...')
+    logger.info('Start crawling links...')
     start_time = time()
     asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))
-    logger.info(f'Stop crawling. {time() - start_time} sec.')
+    logger.info(f'Stop crawling links. {time() - start_time} sec.')
 
     with open(OUTPUT_FILENAME, 'w') as f:
         f.write('\n'.join(sorted(LINKS_TO_TRACK)))

requirements.txt

@@ -1 +1,2 @@
 aiohttp==3.7.4.post0
+aiofiles==0.6.0