Substitute the dynamic API hash in fetched HTML with a fixed placeholder

This commit is contained in:
Il'ya (Marshal) 2021-04-24 15:01:41 +02:00
parent ac628bb231
commit e45fa2c2c2
2 changed files with 4 additions and 1 deletion

View file

@ -2,7 +2,7 @@ name: Fetch new content of tracked links to files
on:
schedule:
- cron: '*/5 * * * * '
- cron: '* * * * * '
push:
branches:
- main

View file

@ -17,6 +17,8 @@ INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')
PAGE_GENERATION_TIME_REGEX = r'<!-- page generated in .+ -->'
PAGE_API_HASH_REGEX = r'api\?hash=.+",'
PAGE_API_HASH_TEMPLATE = r'api?hash=telegram-crawler",'
# insecure but simple: ssl=False skips TLS certificate verification
CONNECTOR = aiohttp.TCPConnector(ssl=False)
@ -44,6 +46,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
async with aiofiles.open(filename, 'w') as f:
content = await response.text()
content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
logger.info(f'Write to {filename}')
await f.write(content)