Delete the dynamic part of the HTML

This commit is contained in:
Il'ya (Marshal) 2021-04-24 14:38:39 +02:00
parent e5f7c10f58
commit 3bd6888cbd
2 changed files with 7 additions and 2 deletions

View file

@ -46,5 +46,5 @@ jobs:
git config --global user.name "GitHub Action"
git add .
git commit -m "Update tracked links"
git commit -m "Update content of files"
git push

View file

@ -1,6 +1,7 @@
import asyncio
import logging
import os
import re
from asyncio.exceptions import TimeoutError
from string import punctuation, whitespace
from time import time
@ -15,6 +16,8 @@ ILLEGAL_PATH_CHARS = punctuation.replace('.', '') + whitespace
INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')
PAGE_GENERATION_TIME_REGEX = r'<!-- page generated in .+ -->'
# insecure but simple: ssl=False turns off TLS certificate verification
CONNECTOR = aiohttp.TCPConnector(ssl=False)
TIMEOUT = aiohttp.ClientTimeout(total=30)
@ -39,8 +42,10 @@ async def crawl(url: str, session: aiohttp.ClientSession):
os.makedirs(os.path.dirname(filename), exist_ok=True)
async with aiofiles.open(filename, 'w') as f:
logger.info(f'Write to {filename}')
content = await response.text()
content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
logger.info(f'Write to {filename}')
await f.write(content)
except (TimeoutError, ClientConnectorError):
await asyncio.gather(crawl(url, session))