mirror of
https://github.com/MarshalX/telegram-crawler.git
synced 2024-11-22 07:19:34 +01:00
print lead time
This commit is contained in:
parent
415a9e9b6c
commit
e385f45da6
1 changed file with 6 additions and 2 deletions
|
@ -1,8 +1,9 @@
|
||||||
import os
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
from html import unescape
|
from html import unescape
|
||||||
|
from time import time
|
||||||
from urllib.parse import unquote
|
from urllib.parse import unquote
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
|
@ -186,7 +187,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
||||||
# TODO track hashes of image/svg/video content types
|
# TODO track hashes of image/svg/video content types
|
||||||
logger.info(f'Unhandled type: {content_type}')
|
logger.info(f'Unhandled type: {content_type}')
|
||||||
except:
|
except:
|
||||||
logger.error('Codec can\'t decode byte. So its was a tgs file')
|
logger.warning('Codec can\'t decode byte. So its was a tgs file')
|
||||||
|
|
||||||
|
|
||||||
async def start(url_list: set[str]):
|
async def start(url_list: set[str]):
|
||||||
|
@ -197,7 +198,10 @@ async def start(url_list: set[str]):
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
HIDDEN_URLS.add(BASE_URL)
|
HIDDEN_URLS.add(BASE_URL)
|
||||||
|
|
||||||
|
logger.info('Start crawling...')
|
||||||
|
start_time = time()
|
||||||
asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))
|
asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))
|
||||||
|
logger.info(f'Stop crawling. {time() - start_time} sec.')
|
||||||
|
|
||||||
with open(OUTPUT_FILENAME, 'w') as f:
|
with open(OUTPUT_FILENAME, 'w') as f:
|
||||||
f.write('\n'.join(sorted(LINKS_TO_TRACK)))
|
f.write('\n'.join(sorted(LINKS_TO_TRACK)))
|
||||||
|
|
Loading…
Reference in a new issue