mirror of https://github.com/MarshalX/telegram-crawler.git, synced 2025-03-15 13:22:43 +01:00
sub dynamic api hash from html

commit e45fa2c2c2 (parent ac628bb231)
2 changed files with 4 additions and 1 deletion
.github/workflows/make_files_tree.yml (vendored)
@@ -2,7 +2,7 @@ name: Fetch new content of tracked links to files
 
 on:
   schedule:
-    - cron: '*/5 * * * * '
+    - cron: '* * * * * '
   push:
     branches:
       - main
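For context (an editor's note, not part of the diff): in cron syntax '*/5 * * * *' fires every five minutes while '* * * * *' fires every minute, so this hunk tightens the crawl schedule as far as the expression allows. GitHub Actions documents roughly a five-minute floor for scheduled workflows, so the effective cadence may still be coarser than the expression suggests. A throwaway illustration of the two minute fields, with hypothetical helper names:

    # Hypothetical sketch (not from the repo) of the two schedules' cadence.
    from datetime import datetime

    def fires_every_five_minutes(dt: datetime) -> bool:
        # '*/5 * * * *' matches minutes 0, 5, 10, ..., 55
        return dt.minute % 5 == 0

    def fires_every_minute(dt: datetime) -> bool:
        # '* * * * *' matches any minute
        return True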
@@ -17,6 +17,8 @@ INPUT_FILENAME = os.environ.get('INPUT_FILENAME', 'tracked_links.txt')
 OUTPUT_FOLDER = os.environ.get('OUTPUT_FOLDER', 'data/')
 
 PAGE_GENERATION_TIME_REGEX = r'<!-- page generated in .+ -->'
+PAGE_API_HASH_REGEX = r'api\?hash=.+",'
+PAGE_API_HASH_TEMPLATE = r'api?hash=telegram-crawler",'
 
 # unsecure but so simple
 CONNECTOR = aiohttp.TCPConnector(ssl=False)
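The two new constants normalize a volatile fragment: the crawled pages embed a per-request hash in an api URL, and rewriting it to a fixed placeholder keeps re-crawled files identical when nothing meaningful changed, just as PAGE_GENERATION_TIME_REGEX already does for the generation-time comment. A minimal sketch of the substitution, using a made-up page fragment and hash value:

    import re

    PAGE_API_HASH_REGEX = r'api\?hash=.+",'
    PAGE_API_HASH_TEMPLATE = r'api?hash=telegram-crawler",'

    # hypothetical snippet of fetched page source; the hash is invented
    fragment = 'initApi("api?hash=7f3c2a9d",'
    print(re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, fragment))
    # prints: initApi("api?hash=telegram-crawler",

Worth noting that '.+' is greedy, so on a line containing several '",' sequences the match would extend to the last one; for these pages that is presumably acceptable.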
@@ -44,6 +46,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
         async with aiofiles.open(filename, 'w') as f:
             content = await response.text()
             content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
+            content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
 
             logger.info(f'Write to {filename}')
             await f.write(content)
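Putting the hunks together, crawl() now scrubs two volatile artifacts before writing each page to disk. A self-contained sketch of that flow under assumed names (the real script is inlined in the workflow; filename derivation, retries, and most logging are omitted, and the path and URL below are placeholders):

    import asyncio
    import re

    import aiofiles
    import aiohttp

    PAGE_GENERATION_TIME_REGEX = r'<!-- page generated in .+ -->'
    PAGE_API_HASH_REGEX = r'api\?hash=.+",'
    PAGE_API_HASH_TEMPLATE = r'api?hash=telegram-crawler",'

    async def crawl(url: str, session: aiohttp.ClientSession) -> None:
        async with session.get(url) as response:
            content = await response.text()
        # strip the per-request generation timestamp, then pin the dynamic hash
        content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
        content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
        async with aiofiles.open('page.html', 'w') as f:  # placeholder path
            await f.write(content)

    async def main() -> None:
        # ssl=False mirrors the script's "unsecure but so simple" connector
        connector = aiohttp.TCPConnector(ssl=False)
        async with aiohttp.ClientSession(connector=connector) as session:
            await crawl('https://core.telegram.org/api', session)  # placeholder URL

    asyncio.run(main())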