mirror of https://github.com/MarshalX/telegram-crawler.git
synced 2024-11-22 07:19:34 +01:00
fix performance and memory issues
This commit is contained in:
parent d449290dfb
commit 246a0087e1
1 changed file with 5 additions and 8 deletions
@@ -67,7 +67,6 @@ OUTPUT_FILENAME = os.environ.get('OUTPUT_FILENAME', 'tracked_links.txt')
 
 # unsecure but so simple
 CONNECTOR = aiohttp.TCPConnector(ssl=False)
-SESSION = aiohttp.ClientSession(connector=CONNECTOR)
 
 logging.basicConfig(format='%(message)s', level=logging.DEBUG)
 logger = logging.getLogger(__name__)
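For context: this module-level SESSION is the object that, in the last hunk below, no longer needs an explicit SESSION.close() at program exit, because the session is now created and closed inside the crawl itself. A minimal sketch of the context-managed lifecycle, with example.com as a placeholder URL not taken from this repository:

import asyncio
import aiohttp

async def main():
    # Instead of a module-level ClientSession that must be closed separately,
    # create the session inside a coroutine and let the context manager close
    # it when the crawl is done. A single session still reuses one connection
    # pool for every request made through it.
    async with aiohttp.ClientSession() as session:
        async with session.get('https://example.com') as response:  # placeholder URL
            print(response.status)

asyncio.run(main())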
@@ -152,7 +151,7 @@ def cleanup_links(links: set[str]) -> set[str]:
     return cleaned_links
 
 
-async def crawl(url: str):
+async def crawl(url: str, session: aiohttp.ClientSession):
     if url.endswith('/'):
         url = url[:-1:]
     if url in VISITED_LINKS or '"' in url:
@@ -161,7 +160,7 @@ async def crawl(url: str):
 
     try:
         logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
-        async with SESSION.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
+        async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
             status_code = response.status
             content_type = response.headers.get('content-type')
 
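The pattern adopted in these two hunks is plain dependency injection: crawl() now receives the shared session as a parameter instead of reaching for a global. A minimal standalone sketch of that shape, with fetch_status and the URLs as hypothetical stand-ins rather than code from the repository:

import asyncio
import aiohttp

async def fetch_status(url: str, session: aiohttp.ClientSession) -> int:
    # The caller owns the session; this function only borrows it, so the same
    # connection pool serves every call and the caller controls its lifetime.
    async with session.get(url, allow_redirects=False) as response:
        return response.status

async def main():
    async with aiohttp.ClientSession() as session:
        for url in ('https://example.com', 'https://example.org'):  # placeholder URLs
            print(url, await fetch_status(url, session))

asyncio.run(main())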
@@ -176,8 +175,7 @@ async def crawl(url: str):
                 relative_links = cleanup_links(find_relative_links(html, url))
 
                 sub_links = absolute_links | relative_links
-                for link in sub_links:
-                    await asyncio.create_task(crawl(link))
+                await asyncio.gather(*[crawl(url, session) for url in sub_links])
             elif 'application/javascript' in content_type:
                 LINKS_TO_TRACK.add(url)
             elif 'text/css' in content_type:
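This hunk is the concurrency change behind the commit message: instead of awaiting each sub-link one after another inside a for loop, all sub-crawls are handed to asyncio.gather and run concurrently on the event loop. A small self-contained sketch of the difference, using asyncio.sleep as a stand-in for a network round trip:

import asyncio
import time

async def fake_request(i: int) -> int:
    await asyncio.sleep(0.2)  # stand-in for an HTTP round trip
    return i

async def sequential() -> None:
    start = time.perf_counter()
    for i in range(5):
        await fake_request(i)  # each await blocks the next one
    print(f'sequential: {time.perf_counter() - start:.2f}s')  # ~1.0s

async def concurrent() -> None:
    start = time.perf_counter()
    await asyncio.gather(*[fake_request(i) for i in range(5)])  # all in flight at once
    print(f'concurrent: {time.perf_counter() - start:.2f}s')  # ~0.2s

asyncio.run(sequential())
asyncio.run(concurrent())

One trade-off to keep in mind: gather launches every sub-crawl at once, so a very large sub_links set means many simultaneous requests; wrapping the request in an asyncio.Semaphore is the usual way to cap that, though this commit does not add one.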
@@ -192,15 +190,14 @@ async def crawl(url: str):
 
 
 async def start(url_list: set[str]):
-    for url in url_list:
-        await asyncio.create_task(crawl(url))
+    async with aiohttp.ClientSession(connector=CONNECTOR) as session:
+        await asyncio.gather(*[crawl(url, session) for url in url_list])
 
 
 if __name__ == '__main__':
     HIDDEN_URLS.add(BASE_URL)
 
     asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))
-    asyncio.get_event_loop().run_until_complete(SESSION.close())
 
     with open(OUTPUT_FILENAME, 'w') as f:
         f.write('\n'.join(sorted(LINKS_TO_TRACK)))
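Putting the pieces together, the new start() owns the session for the whole crawl: the async with block closes it (and its connection pool) once every gathered crawl finishes, which is why the explicit SESSION.close() call above disappears. A minimal end-to-end sketch of the same wiring, with placeholder seed URLs and a trivial crawl body in place of the repository's logic:

import asyncio
import aiohttp

PROTOCOL = 'https://'
SEEDS = {'example.com', 'example.org'}  # placeholder seed URLs

async def crawl(url: str, session: aiohttp.ClientSession) -> None:
    async with session.get(f'{PROTOCOL}{url}', allow_redirects=False) as response:
        print(url, response.status)

async def start(url_list: set[str]) -> None:
    # ssl=False mirrors the repository's CONNECTOR; drop it for real use.
    connector = aiohttp.TCPConnector(ssl=False)
    async with aiohttp.ClientSession(connector=connector) as session:
        await asyncio.gather(*[crawl(url, session) for url in url_list])

if __name__ == '__main__':
    # asyncio.run is the modern equivalent of get_event_loop().run_until_complete.
    asyncio.run(start(SEEDS))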