mirror of https://github.com/MarshalX/telegram-crawler.git
synced 2024-11-22 07:19:34 +01:00
fix performance and memory issues
This commit is contained in:
parent d449290dfb
commit 246a0087e1
1 changed file with 5 additions and 8 deletions
@@ -67,7 +67,6 @@ OUTPUT_FILENAME = os.environ.get('OUTPUT_FILENAME', 'tracked_links.txt')
 
 # unsecure but so simple
 CONNECTOR = aiohttp.TCPConnector(ssl=False)
-SESSION = aiohttp.ClientSession(connector=CONNECTOR)
 
 logging.basicConfig(format='%(message)s', level=logging.DEBUG)
 logger = logging.getLogger(__name__)
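For context: this module-level SESSION is the object that, in the last hunk below, no longer needs an explicit SESSION.close() at program exit, because the session is now created and closed inside the crawl itself. A minimal sketch of the context-managed lifecycle, with example.com as a placeholder URL not taken from this repository:

import asyncio
import aiohttp

async def main():
    # Instead of a module-level ClientSession that must be closed separately,
    # create the session inside a coroutine and let the context manager close
    # it when the crawl is done. A single session still reuses one connection
    # pool for every request made through it.
    async with aiohttp.ClientSession() as session:
        async with session.get('https://example.com') as response:  # placeholder URL
            print(response.status)

asyncio.run(main())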
@@ -152,7 +151,7 @@ def cleanup_links(links: set[str]) -> set[str]:
     return cleaned_links
 
 
-async def crawl(url: str):
+async def crawl(url: str, session: aiohttp.ClientSession):
     if url.endswith('/'):
         url = url[:-1:]
     if url in VISITED_LINKS or '"' in url:
@@ -161,7 +160,7 @@ async def crawl(url: str):
 
     try:
         logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
-        async with SESSION.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
+        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
             status_code = response.status
             content_type = response.headers.get('content-type')
 
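The pattern adopted in these two hunks is plain dependency injection: crawl() now receives the shared session as a parameter instead of reaching for a global. A minimal standalone sketch of that shape, with fetch_status and the URLs as hypothetical stand-ins rather than code from the repository:

import asyncio
import aiohttp

async def fetch_status(url: str, session: aiohttp.ClientSession) -> int:
    # The caller owns the session; this function only borrows it, so the same
    # connection pool serves every call and the caller controls its lifetime.
    async with session.get(url, allow_redirects=False) as response:
        return response.status

async def main():
    async with aiohttp.ClientSession() as session:
        for url in ('https://example.com', 'https://example.org'):  # placeholder URLs
            print(url, await fetch_status(url, session))

asyncio.run(main())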
@@ -176,8 +175,7 @@ async def crawl(url: str):
                 relative_links = cleanup_links(find_relative_links(html, url))
 
                 sub_links = absolute_links | relative_links
-                for link in sub_links:
-                    await asyncio.create_task(crawl(link))
+                await asyncio.gather(*[crawl(url, session) for url in sub_links])
             elif 'application/javascript' in content_type:
                 LINKS_TO_TRACK.add(url)
             elif 'text/css' in content_type:
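This hunk is the concurrency change behind the commit message: instead of awaiting each sub-link one after another inside a for loop, all sub-crawls are handed to asyncio.gather and run concurrently on the event loop. A small self-contained sketch of the difference, using asyncio.sleep as a stand-in for a network round trip:

import asyncio
import time

async def fake_request(i: int) -> int:
    await asyncio.sleep(0.2)  # stand-in for an HTTP round trip
    return i

async def sequential() -> None:
    start = time.perf_counter()
    for i in range(5):
        await fake_request(i)  # each await blocks the next one
    print(f'sequential: {time.perf_counter() - start:.2f}s')  # ~1.0s

async def concurrent() -> None:
    start = time.perf_counter()
    await asyncio.gather(*[fake_request(i) for i in range(5)])  # all in flight at once
    print(f'concurrent: {time.perf_counter() - start:.2f}s')  # ~0.2s

asyncio.run(sequential())
asyncio.run(concurrent())

One trade-off to keep in mind: gather launches every sub-crawl at once, so a very large sub_links set means many simultaneous requests; wrapping the request in an asyncio.Semaphore is the usual way to cap that, though this commit does not add one.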
@@ -192,15 +190,14 @@ async def crawl(url: str):
 
 
 async def start(url_list: set[str]):
-    for url in url_list:
-        await asyncio.create_task(crawl(url))
+    async with aiohttp.ClientSession(connector=CONNECTOR) as session:
+        await asyncio.gather(*[crawl(url, session) for url in url_list])
 
 
 if __name__ == '__main__':
     HIDDEN_URLS.add(BASE_URL)
 
     asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))
-    asyncio.get_event_loop().run_until_complete(SESSION.close())
 
     with open(OUTPUT_FILENAME, 'w') as f:
         f.write('\n'.join(sorted(LINKS_TO_TRACK)))
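Putting the pieces together, the new start() owns the session for the whole crawl: the async with block closes it (and its connection pool) once every gathered crawl finishes, which is why the explicit SESSION.close() call above disappears. A minimal end-to-end sketch of the same wiring, with placeholder seed URLs and a trivial crawl body in place of the repository's logic:

import asyncio
import aiohttp

PROTOCOL = 'https://'
SEEDS = {'example.com', 'example.org'}  # placeholder seed URLs

async def crawl(url: str, session: aiohttp.ClientSession) -> None:
    async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
        print(url, response.status)

async def start(url_list: set[str]) -> None:
    # ssl=False mirrors the repository's CONNECTOR; drop it for real use.
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        await asyncio.gather(*[crawl(url, session) for url in url_list])

if __name__ == '__main__':
    # asyncio.run is the modern equivalent of get_event_loop().run_until_complete.
    asyncio.run(start(SEEDS))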