telegram-crawler/make_files_tree.py
Il'ya (Marshal) 6ceb897e33 move tracked links list to main branch;
add generation of content tree;
add workflow for tree generation;
add new tg hidden link.
2021-04-24 14:19:01 +02:00

61 lines
2.1 KiB
Python

import asyncio
import logging
import os
from asyncio.exceptions import TimeoutError
from string import punctuation, whitespace
from time import time
import aiofiles
import aiohttp
from aiohttp import ClientConnectorError
PROTOCOL = 'https://'
ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace
INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')
# unsecure but so simple
CONNECTOR = aiohttp.TCPConnector(ssl=False)
TIMEOUT = aiohttp.ClientTimeout(total=30)
logging.basicConfig(format='%(message)s', level=logging.DEBUG)
logger = logging.getLogger(__name__)
async def crawl(url: str, session: aiohttp.ClientSession):
try:
logger.info(f'Process {url}')
async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
if response.status != 200:
return
# bypass external slashes and so on
url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
# handle pure domains and html pages without ext in url
ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
os.makedirs(os.path.dirname(filename), exist_ok=True)
async with aiofiles.open(filename, 'w') as f:
logger.info(f'Write to {filename}')
content = await response.text()
await f.write(content)
except (TimeoutError, ClientConnectorError):
await asyncio.gather(crawl(url, session))
async def start(url_list: set[str]):
async with aiohttp.ClientSession(connector=CONNECTOR) as session:
await asyncio.gather(*[crawl(url, session) for url in url_list])
if __name__ == '__main__':
with open(INPUT_FILENAME, 'r') as f:
tracked_urls = set([l.replace('\n', '') for l in f.readlines()])
logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
start_time = time()
asyncio.get_event_loop().run_until_complete(start(tracked_urls))
logger.info(f'Stop crawling content. {time() - start_time} sec.')