mirror of https://github.com/MarshalX/telegram-crawler.git
synced 2025-01-23 01:12:02 +01:00
add crawling of scripts from other scripts. close #2
This commit is contained in:
parent 4dfa5686f0
commit cb5e8c1069
1 changed file with 33 additions and 5 deletions
@@ -129,6 +129,7 @@ CRAWL_RULES = {
 DIRECT_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,249}' + BASE_URL_REGEX + r')'
 ABSOLUTE_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,248}' + BASE_URL_REGEX + r'\b[-a-zA-Z0-9@:%_\+.~#?&//=]*)'
 RELATIVE_LINK_REGEX = r'\/(?!\/)([-a-zA-Z0-9\/@:%._\+~#]{0,249})'
+RELATIVE_JS_SCRIPTS_REGEX = r'["\'](.*\.js)["\'\?]'
 
 DOM_ATTRS = ['href', 'src']
 
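The new pattern grabs quoted script paths out of JS source. A quick check of what it captures (the script name below is illustrative); note that the greedy .* can over-capture when a single line quotes several .js files:

import re

RELATIVE_JS_SCRIPTS_REGEX = r'["\'](.*\.js)["\'\?]'

# the trailing ? in the character class lets it match cache-busted includes like x.js?46
print(re.findall(RELATIVE_JS_SCRIPTS_REGEX, 'importScripts("tgsticker.worker.js?46");'))
# ['tgsticker.worker.js']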
@@ -199,6 +200,28 @@ def find_relative_links(html: str, cur_link: str) -> set[str]:
     return relative_links
 
 
+def find_relative_scripts(code: str, cur_link: str) -> set[str]:
+    matches = re.findall(DIRECT_LINK_REGEX, cur_link)
+    if not matches:
+        return set()
+
+    direct_cur_link = re.findall(DIRECT_LINK_REGEX, cur_link)[0]
+
+    relative_links = set()
+    for link in re.findall(RELATIVE_JS_SCRIPTS_REGEX, code):
+        # dirty magic for specific cases
+        if '/' in link:  # path to file from the root
+            url = f'{direct_cur_link}/{link}'
+        else:  # it's a relative link from the current folder, not from the root
+            current_folder_link, *_ = cur_link.rsplit('/', 1)
+            url = f'{current_folder_link}/{link}'
+
+        if not should_exclude(url):
+            relative_links.add(url)
+
+    return relative_links
+
+
 def cleanup_links(links: set[str]) -> set[str]:
     cleaned_links = set()
     for tmp_link in links:
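A sketch of how the two branches of find_relative_scripts resolve URLs, assuming BASE_URL_REGEX matches telegram.org hosts as elsewhere in this file (the file names are illustrative):

import re

BASE_URL_REGEX = r'telegram\.org'  # stand-in for the real pattern defined above
DIRECT_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,249}' + BASE_URL_REGEX + r')'

cur_link = 'core.telegram.org/js/tgsticker.js'

# a link containing '/' is resolved from the site root
direct_cur_link = re.findall(DIRECT_LINK_REGEX, cur_link)[0]  # 'core.telegram.org'
print(f'{direct_cur_link}/js/tgwebgl.js')   # core.telegram.org/js/tgwebgl.js

# a bare file name is resolved from the current folder
current_folder_link, *_ = cur_link.rsplit('/', 1)             # 'core.telegram.org/js'
print(f'{current_folder_link}/tgwebgl.js')  # core.telegram.org/js/tgwebgl.js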
@@ -207,6 +230,7 @@ def cleanup_links(links: set[str]) -> set[str]:
         link = unescape(link)
         link = link.replace('www.', '')
         link = link.replace('http://', '').replace('https://', '')
+        link = link.replace('//', '/')  # not a universal solution
 
         # skip anchor links
         if '#' in link:
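The doubled-slash collapse is safe for duplicated path slashes only because the schemes were already stripped on the previous line; as the in-code comment warns, it is not universal (a protocol-relative '//host' link, for instance, would be left with a leading slash). A worked example with an illustrative URL:

link = 'https://core.telegram.org//js/tgsticker.js'
link = link.replace('http://', '').replace('https://', '')  # 'core.telegram.org//js/tgsticker.js'
link = link.replace('//', '/')                              # 'core.telegram.org/js/tgsticker.js'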
@@ -228,7 +252,6 @@ def cleanup_links(links: set[str]) -> set[str]:
 
 def is_trackable_content_type(content_type) -> bool:
     trackable_content_types = (
-        'javascript',
     'css',
     'plain',
     'json',
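With JavaScript responses now parsed for links inside crawl(), 'javascript' drops out of the generic tracking list. A minimal sketch of the membership test this tuple appears to feed (the helper below is a hypothetical stand-in, assuming substring checks against the MIME type):

trackable_content_types = ('css', 'plain', 'json')

def is_trackable(content_type: str) -> bool:
    # hypothetical stand-in for is_trackable_content_type
    return any(marker in content_type for marker in trackable_content_types)

print(is_trackable('application/javascript'))  # False: JS goes through crawl()'s link parser instead
print(is_trackable('text/css'))                # True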
@@ -269,12 +292,17 @@ async def crawl(url: str, session: aiohttp.ClientSession):
             logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
             return
 
-        if 'text' in content_type:
+        if 'text' in content_type or 'javascript' in content_type:
             LINKS_TO_TRACK.add(url)
 
-            html = await response.text(encoding='UTF-8')
-            absolute_links = cleanup_links(find_absolute_links(html))
-            relative_links = cleanup_links(find_relative_links(html, url))
+            content = await response.text(encoding='UTF-8')
+            absolute_links = cleanup_links(find_absolute_links(content))
+
+            relative_links_finder = find_relative_links
+            if 'javascript' in content_type:
+                relative_links_finder = find_relative_scripts
+
+            relative_links = cleanup_links(relative_links_finder(content, url))
 
             sub_links = absolute_links | relative_links
             await asyncio.gather(*[crawl(url, session) for url in sub_links])
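The change swaps the hardcoded find_relative_links call for a finder chosen by content type, so HTML keeps the old DOM-based extractor while JS responses get the new regex-based one, and both feed the same cleanup and recursive-crawl pipeline. A self-contained sketch of that dispatch (the finder bodies are stand-ins for the real functions in this file):

def find_relative_links(content: str, cur_link: str) -> set[str]:
    return {'html-derived link'}    # stand-in

def find_relative_scripts(content: str, cur_link: str) -> set[str]:
    return {'script-derived link'}  # stand-in

def pick_finder(content_type: str):
    # same first-class-function dispatch as the diff above
    finder = find_relative_links
    if 'javascript' in content_type:
        finder = find_relative_scripts
    return finder

print(pick_finder('text/html').__name__)               # find_relative_links
print(pick_finder('application/javascript').__name__)  # find_relative_scripts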