mirror of
https://github.com/MarshalX/telegram-crawler.git
synced 2025-01-11 12:41:37 +01:00
delete dynamic part of html
This commit is contained in:
parent
e5f7c10f58
commit
3bd6888cbd
2 changed files with 7 additions and 2 deletions
2
.github/workflows/make_files_tree.yml
vendored
2
.github/workflows/make_files_tree.yml
vendored
|
@ -46,5 +46,5 @@ jobs:
|
|||
git config --global user.name "GitHub Action"
|
||||
|
||||
git add .
|
||||
git commit -m "Update tracked links"
|
||||
git commit -m "Update content of files"
|
||||
git push
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from asyncio.exceptions import TimeoutError
|
||||
from string import punctuation, whitespace
|
||||
from time import time
|
||||
|
@ -15,6 +16,8 @@ ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace
|
|||
INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
|
||||
OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')
|
||||
|
||||
PAGE_GENERATION_TIME_REGEX = r'<!-- page generated in .+ -->'
|
||||
|
||||
# insecure but simple: ssl=False disables TLS certificate verification
|
||||
CONNECTOR = aiohttp.TCPConnector(ssl=False)
|
||||
TIMEOUT = aiohttp.ClientTimeout(total=30)
|
||||
|
@ -39,8 +42,10 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
|||
|
||||
os.makedirs(os.path.dirname(filename), exist_ok=True)
|
||||
async with aiofiles.open(filename, 'w') as f:
|
||||
logger.info(f'Write to {filename}')
|
||||
content = await response.text()
|
||||
content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
|
||||
|
||||
logger.info(f'Write to {filename}')
|
||||
await f.write(content)
|
||||
except (TimeoutError, ClientConnectorError):
|
||||
await asyncio.gather(crawl(url, session))
|
||||
|
|
Loading…
Reference in a new issue