Mirror of https://github.com/MarshalX/telegram-crawler.git, synced 2024-11-24 16:29:45 +01:00
move tracked links list to main branch; add generation of content tree; add workflow for tree generation; add new tg hidden link.
parent 5f29beda73
commit 6ceb897e33

5 changed files with 120 additions and 4 deletions
.github/workflows/make_files_tree.yml (vendored, new file, 50 lines)

@@ -0,0 +1,50 @@
name: Fetch new content of tracked links to files

on:
  schedule:
    - cron: '0 * * * *'
  push:
    branches:
      - main-test

jobs:
  make_tracked_links_file:
    name: Make files tree
    runs-on: macos-10.15

    steps:

      - name: Clone.
        uses: actions/checkout@v2
        with:
          submodules: recursive

      - name: Setup Python.
        uses: actions/setup-python@v2
        with:
          python-version: 3.9

      - name: Install dependencies.
        run: |
          pip install -r requirements.txt

      - name: Generate files tree.
        env:
          OUTPUT_FOLDER: "data_ci/"
        run: |
          python make_files_tree.py

      - name: Commit and push changes.
        run: |
          git pull
          git checkout data

          rm -rf data
          mv data_ci data

          git config --global user.email "github-action@users.noreply.github.com"
          git config --global user.name "GitHub Action"

          git add .
          git commit -m "Update tracked links"
          git push
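For reference, the "Generate files tree." step above amounts to exporting OUTPUT_FOLDER and running the new script. A minimal local sketch, not part of the commit, assuming the dependencies from requirements.txt are installed and the working directory is the repository root:

# Rough local equivalent of the "Generate files tree." step (illustration only).
import os
import subprocess

os.environ['OUTPUT_FOLDER'] = 'data_ci/'  # same value the workflow sets in its env block
subprocess.run(['python', 'make_files_tree.py'], check=True)  # the command the workflow runs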
@ -37,7 +37,6 @@ jobs:
|
|||
- name: Commit and push changes.
|
||||
run: |
|
||||
git pull
|
||||
git checkout data
|
||||
|
||||
mv tracked_links_ci.txt tracked_links.txt
|
||||
|
||||
|
|
make_files_tree.py (new file, 61 lines)

@@ -0,0 +1,61 @@
import asyncio
import logging
import os
from asyncio.exceptions import TimeoutError
from string import punctuation, whitespace
from time import time

import aiofiles
import aiohttp
from aiohttp import ClientConnectorError

PROTOCOL = 'https://'
ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace

INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')

# unsecure but so simple
CONNECTOR = aiohttp.TCPConnector(ssl=False)
TIMEOUT = aiohttp.ClientTimeout(total=30)

logging.basicConfig(format='%(message)s', level=logging.DEBUG)
logger = logging.getLogger(__name__)


async def crawl(url: str, session: aiohttp.ClientSession):
    try:
        logger.info(f'Process {url}')
        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
            if response.status != 200:
                return

            # bypass external slashes and so on
            url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
            # handle pure domains and html pages without ext in url
            ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''

            filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext

            os.makedirs(os.path.dirname(filename), exist_ok=True)
            async with aiofiles.open(filename, 'w') as f:
                logger.info(f'Write to {filename}')
                content = await response.text()
                await f.write(content)
    except (TimeoutError, ClientConnectorError):
        await asyncio.gather(crawl(url, session))


async def start(url_list: set[str]):
    async with aiohttp.ClientSession(connector=CONNECTOR) as session:
        await asyncio.gather(*[crawl(url, session) for url in url_list])


if __name__ == '__main__':
    with open(INPUT_FILENAME, 'r') as f:
        tracked_urls = set([l.replace('\n', '') for l in f.readlines()])

    logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
    start_time = time()
    asyncio.get_event_loop().run_until_complete(start(tracked_urls))
    logger.info(f'Stop crawling content. {time() - start_time} sec.')
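The new script can also be driven directly from Python. A small usage sketch, not part of the commit, run from the repository root on the Python 3.9 the workflow pins, with the pinned aiohttp and aiofiles installed; the two URLs are samples taken from HIDDEN_URLS below:

# Fetch a couple of tracked pages into the default data/ folder (illustration only).
import asyncio

import make_files_tree  # the new module above, importable from the repository root

urls = {'telegram.org/tos', 'telegram.org/evolution'}  # sample URLs, see HIDDEN_URLS
asyncio.get_event_loop().run_until_complete(make_files_tree.start(urls))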
@@ -19,6 +19,7 @@ HIDDEN_URLS = {
     'telegram.org/privacy/gmailbot',
     'telegram.org/tos',
     'telegram.org/tour',
     'telegram.org/evolution',

+    'desktop.telegram.org/changelog',
 }
@@ -171,6 +172,10 @@ def cleanup_links(links: set[str]) -> set[str]:


 async def crawl(url: str, session: aiohttp.ClientSession):
+    # todo
+    if url.endswith('.'):
+        return
+
     without_trailing_slash = url[:-1:] if url.endswith('/') else url
     if without_trailing_slash in VISITED_LINKS:
         return
@@ -228,10 +233,10 @@ async def start(url_list: set[str]):
 if __name__ == '__main__':
     HIDDEN_URLS.add(BASE_URL)

-    logger.info('Start crawling...')
+    logger.info('Start crawling links...')
     start_time = time()
     asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))
-    logger.info(f'Stop crawling. {time() - start_time} sec.')
+    logger.info(f'Stop crawling links. {time() - start_time} sec.')

     with open(OUTPUT_FILENAME, 'w') as f:
         f.write('\n'.join(sorted(LINKS_TO_TRACK)))
@@ -1 +1,2 @@
 aiohttp==3.7.4.post0
+aiofiles==0.6.0