Mirror of https://github.com/MarshalX/telegram-crawler.git (synced 2024-11-21 23:06:40 +01:00)

move tracked links list to main branch; add generation of content tree; add workflow for tree generation; add new tg hidden link.

parent 5f29beda73
commit 6ceb897e33

5 changed files with 120 additions and 4 deletions
.github/workflows/make_files_tree.yml (new file, 50 lines)
@@ -0,0 +1,50 @@
name: Fetch new content of tracked links to files

on:
  schedule:
    - cron: '0 * * * *'
  push:
    branches:
      - main-test

jobs:
  make_tracked_links_file:
    name: Make files tree
    runs-on: macos-10.15

    steps:

      - name: Clone.
        uses: actions/checkout@v2
        with:
          submodules: recursive

      - name: Setup Python.
        uses: actions/setup-python@v2
        with:
          python-version: 3.9

      - name: Install dependencies.
        run: |
          pip install -r requirements.txt

      - name: Generate files tree.
        env:
          OUTPUT_FOLDER: "data_ci/"
        run: |
          python make_files_tree.py

      - name: Commit and push changes.
        run: |
          git pull
          git checkout data

          rm -rf data
          mv data_ci data

          git config --global user.email "github-action@users.noreply.github.com"
          git config --global user.name "GitHub Action"

          git add .
          git commit -m "Update tracked links"
          git push
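Not part of the commit: a rough Python sketch of reproducing the "Generate files tree." step locally, assuming the repository root as the working directory. It exports the same OUTPUT_FOLDER value the workflow sets, then runs the new script as a subprocess.

# hypothetical local equivalent of the CI step (illustration only)
import os
import subprocess

os.environ['OUTPUT_FOLDER'] = 'data_ci/'  # same value the workflow passes via env
subprocess.run(['python', 'make_files_tree.py'], check=True)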
@@ -37,7 +37,6 @@ jobs:
       - name: Commit and push changes.
         run: |
           git pull
-          git checkout data

           mv tracked_links_ci.txt tracked_links.txt
make_files_tree.py (new file, 61 lines)
@@ -0,0 +1,61 @@
import asyncio
import logging
import os
from asyncio.exceptions import TimeoutError
from string import punctuation, whitespace
from time import time

import aiofiles
import aiohttp
from aiohttp import ClientConnectorError

PROTOCOL = 'https://'
ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace

INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')

# unsecure but so simple
CONNECTOR = aiohttp.TCPConnector(ssl=False)
TIMEOUT = aiohttp.ClientTimeout(total=30)

logging.basicConfig(format='%(message)s', level=logging.DEBUG)
logger = logging.getLogger(__name__)


async def crawl(url: str, session: aiohttp.ClientSession):
    try:
        logger.info(f'Process {url}')
        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
            if response.status != 200:
                return

            # bypass external slashes and so on
            url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
            # handle pure domains and html pages without ext in url
            ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''

            filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext

            os.makedirs(os.path.dirname(filename), exist_ok=True)
            async with aiofiles.open(filename, 'w') as f:
                logger.info(f'Write to {filename}')
                content = await response.text()
                await f.write(content)
    except (TimeoutError, ClientConnectorError):
        await asyncio.gather(crawl(url, session))


async def start(url_list: set[str]):
    async with aiohttp.ClientSession(connector=CONNECTOR) as session:
        await asyncio.gather(*[crawl(url, session) for url in url_list])


if __name__ == '__main__':
    with open(INPUT_FILENAME, 'r') as f:
        tracked_urls = set([l.replace('\n', '') for l in f.readlines()])

    logger.info(f'Start crawling content of {len(tracked_urls)} tracked urls...')
    start_time = time()
    asyncio.get_event_loop().run_until_complete(start(tracked_urls))
    logger.info(f'Stop crawling content. {time() - start_time} sec.')
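An aside, not part of the commit: the URL-to-path mapping inside crawl() can be illustrated standalone. The constants below mirror the script's defaults; to_filename() is a hypothetical helper used only for this sketch.

# standalone sketch of crawl()'s URL-to-filename mapping (illustration only)
from string import punctuation, whitespace

ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace
OUTPUT_FOLDER = 'data/'

def to_filename(url: str) -> str:
    # drop empty parts produced by stray slashes; keep dots for real extensions
    url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
    # bare domains and extension-less pages get an .html suffix
    ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
    return OUTPUT_FOLDER + '/'.join(url_parts) + ext

assert to_filename('telegram.org/evolution') == 'data/telegram.org/evolution.html'
assert to_filename('telegram.org') == 'data/telegram.org.html'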
@@ -19,6 +19,7 @@ HIDDEN_URLS = {
     'telegram.org/privacy/gmailbot',
     'telegram.org/tos',
     'telegram.org/tour',
+    'telegram.org/evolution',

     'desktop.telegram.org/changelog',
 }
@@ -171,6 +172,10 @@ def cleanup_links(links: set[str]) -> set[str]:


 async def crawl(url: str, session: aiohttp.ClientSession):
+    # todo
+    if url.endswith('.'):
+        return
+
     without_trailing_slash = url[:-1:] if url.endswith('/') else url
     if without_trailing_slash in VISITED_LINKS:
         return
@@ -228,10 +233,10 @@ async def start(url_list: set[str]):
 if __name__ == '__main__':
     HIDDEN_URLS.add(BASE_URL)

-    logger.info('Start crawling...')
+    logger.info('Start crawling links...')
     start_time = time()
     asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))
-    logger.info(f'Stop crawling. {time() - start_time} sec.')
+    logger.info(f'Stop crawling links. {time() - start_time} sec.')

     with open(OUTPUT_FILENAME, 'w') as f:
         f.write('\n'.join(sorted(LINKS_TO_TRACK)))
requirements.txt
@@ -1 +1,2 @@
 aiohttp==3.7.4.post0
+aiofiles==0.6.0
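The new aiofiles pin backs the non-blocking file writes in make_files_tree.py. For reference (not part of the commit), the write pattern it provides looks roughly like this:

# minimal aiofiles write sketch (illustration only)
import asyncio
import aiofiles

async def dump(path: str, content: str):
    async with aiofiles.open(path, 'w') as f:
        await f.write(content)

asyncio.run(dump('example.html', '<html></html>'))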