From cb5e8c1069a8cab0b42d14f64895c1ee837dba14 Mon Sep 17 00:00:00 2001 From: "Il'ya (Marshal)" Date: Sun, 10 Apr 2022 19:33:33 +0200 Subject: [PATCH] add crawling of scripts from other scripts. close #2 --- make_tracked_links_list.py | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/make_tracked_links_list.py b/make_tracked_links_list.py index 5e5451402d..1a08ab75bf 100644 --- a/make_tracked_links_list.py +++ b/make_tracked_links_list.py @@ -129,6 +129,7 @@ CRAWL_RULES = { DIRECT_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,249}' + BASE_URL_REGEX + r')' ABSOLUTE_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,248}' + BASE_URL_REGEX + r'\b[-a-zA-Z0-9@:%_\+.~#?&//=]*)' RELATIVE_LINK_REGEX = r'\/(?!\/)([-a-zA-Z0-9\/@:%._\+~#]{0,249})' +RELATIVE_JS_SCRIPTS_REGEX = r'["\'](.*\.js)["\'\?]' DOM_ATTRS = ['href', 'src'] @@ -199,6 +200,28 @@ def find_relative_links(html: str, cur_link: str) -> set[str]: return relative_links +def find_relative_scripts(code: str, cur_link: str) -> set[str]: + matches = re.findall(DIRECT_LINK_REGEX, cur_link) + if not matches: + return set() + + direct_cur_link = re.findall(DIRECT_LINK_REGEX, cur_link)[0] + + relative_links = set() + for link in re.findall(RELATIVE_JS_SCRIPTS_REGEX, code): + # dirty magic for specific cases + if '/' in link: # path to file from the root + url = f'{direct_cur_link}/{link}' + else: # its relative link from current folder. Not from the root + current_folder_link, *_ = cur_link.rsplit('/', 1) + url = f'{current_folder_link}/{link}' + + if not should_exclude(url): + relative_links.add(url) + + return relative_links + + def cleanup_links(links: set[str]) -> set[str]: cleaned_links = set() for tmp_link in links: @@ -207,6 +230,7 @@ def cleanup_links(links: set[str]) -> set[str]: link = unescape(link) link = link.replace('www.', '') link = link.replace('http://', '').replace('https://', '') + link = link.replace('//', '/') # not a universal solution # skip anchor links if '#' in link: @@ -228,7 +252,6 @@ def cleanup_links(links: set[str]) -> set[str]: def is_trackable_content_type(content_type) -> bool: trackable_content_types = ( - 'javascript', 'css', 'plain', 'json', @@ -269,12 +292,17 @@ async def crawl(url: str, session: aiohttp.ClientSession): logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}') return - if 'text' in content_type: + if 'text' in content_type or 'javascript' in content_type: LINKS_TO_TRACK.add(url) - html = await response.text(encoding='UTF-8') - absolute_links = cleanup_links(find_absolute_links(html)) - relative_links = cleanup_links(find_relative_links(html, url)) + content = await response.text(encoding='UTF-8') + absolute_links = cleanup_links(find_absolute_links(content)) + + relative_links_finder = find_relative_links + if 'javascript' in content_type: + relative_links_finder = find_relative_scripts + + relative_links = cleanup_links(relative_links_finder(content, url)) sub_links = absolute_links | relative_links await asyncio.gather(*[crawl(url, session) for url in sub_links])