fix, improve and clean up links crawler; add diff of files

Il'ya (Marshal) 2021-04-24 22:40:25 +02:00
parent b944e999cb
commit dcd5adc08a
2 changed files with 34 additions and 31 deletions


@@ -55,7 +55,8 @@ Copy of Telegram websites stored **[here](https://github.com/MarshalX/telegram-c
### TODO list
- bug fixes;
- alert system.
- alert system;
- add storing hashes of image, svg, video.
### Example of link crawler rules configuration


@@ -77,8 +77,8 @@ CRAWL_RULES = {
}
DIRECT_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,249}' + BASE_URL_REGEX + r')'
ABSOLUTE_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{1,249}' + BASE_URL_REGEX + r'\b[-a-zA-Z0-9@:%_\+.~#?&\/\/=]*)'
RELATIVE_LINK_REGEX = r'\/([-a-zA-Z0-9\/@:%._\+~#]{0,249})'
ABSOLUTE_LINK_REGEX = r'([-a-zA-Z0-9@:%._\+~#]{0,248}' + BASE_URL_REGEX + r'\b[-a-zA-Z0-9@:%_\+.~#?&//=]*)'
RELATIVE_LINK_REGEX = r'\/(?!\/)([-a-zA-Z0-9\/@:%._\+~#]{0,249})'
DOM_ATTRS = ['href', 'src']
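
A minimal sketch of what the new (?!\/) lookahead buys: protocol-relative links ("//www.apple.com/...") no longer match as relative links. Anchoring the pattern at an href="..." attribute is an assumption based on DOM_ATTRS, and BASE_URL_REGEX is not shown in this hunk, so it is left out here.

import re

# new pattern from this commit; the lookahead rejects a second leading slash
RELATIVE_LINK_REGEX = r'\/(?!\/)([-a-zA-Z0-9\/@:%._\+~#]{0,249})'

html = '<a href="/faq">FAQ</a><a href="//www.apple.com/shop">Shop</a>'
# anchoring at href="..." is an assumption, suggested by DOM_ATTRS
print(re.findall(f'href="{RELATIVE_LINK_REGEX}', html))
# ['faq'] -- the protocol-relative //www.apple.com link is skipped
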
@@ -137,16 +137,9 @@ def find_relative_links(html: str, cur_link: str) -> set[str]:
    links = re.findall(regex, html)
    for link in links:
        # bypass protocol-relative links such as //www.apple.com
        if link.startswith('/'):
            # absolute links starting with a double slash
            if find_absolute_links(link):
                if not should_exclude(link[1::]):
                    relative_links.add(link[1::])
        else:
            url = f'{direct_cur_link}/{link}'
            if not should_exclude(url):
                relative_links.add(url)
        url = f'{direct_cur_link}/{link}'
        if not should_exclude(url):
            relative_links.add(url)
    return relative_links
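
For reference, a self-contained sketch of the simplified loop above; collect_relative_links, the found argument, and the inline should_exclude are stand-ins for the crawler's own helpers. The special-casing of protocol-relative links moved into the regex, so every match is now joined to the current page and filtered once.

def collect_relative_links(found: set, direct_cur_link: str, should_exclude) -> set:
    # join every match to the current page link, then filter
    relative_links = set()
    for link in found:
        url = f'{direct_cur_link}/{link}'
        if not should_exclude(url):
            relative_links.add(url)
    return relative_links

print(collect_relative_links({'faq', 'api/errors'}, 'core.telegram.org',
                             lambda u: u.endswith('errors')))
# {'core.telegram.org/faq'}
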
@@ -159,10 +152,16 @@ def cleanup_links(links: set[str]) -> set[str]:
        link = unescape(link)
        link = link.replace('www.', '')
        link = link.replace('http://', '').replace('https://', '')
        # skip anchor links
        if '#' in link:
            continue
        # remove GET params from the link
        if '?' in link:
            link = link.split('?')[0]
        # skip mailto: links
        link_parts = link.split('.')
        if '@' in link_parts[0]:
            continue
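
A runnable sketch of the cleanup rules in this hunk (the unescape/www/scheme stripping plus the new anchor and query-param handling), written as a single-link helper rather than the full set-based function:

from html import unescape

def cleanup_link(link: str):
    link = unescape(link)
    link = link.replace('www.', '')
    link = link.replace('http://', '').replace('https://', '')
    if '#' in link:                 # skip anchor links
        return None
    if '?' in link:                 # drop GET params
        link = link.split('?')[0]
    if '@' in link.split('.')[0]:   # skip mailto: links
        return None
    return link

for raw in ('https://core.telegram.org/api?layer=125',
            'https://telegram.org/faq#general',
            'mailto:abuse@telegram.org'):
    print(cleanup_link(raw))
# core.telegram.org/api, then None twice
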
@@ -173,14 +172,9 @@ async def crawl(url: str, session: aiohttp.ClientSession):
async def crawl(url: str, session: aiohttp.ClientSession):
    # todo
    if url.endswith('.'):
    if url in VISITED_LINKS:
        return
    without_trailing_slash = url[:-1:] if url.endswith('/') else url
    if without_trailing_slash in VISITED_LINKS:
        return
    VISITED_LINKS.add(without_trailing_slash)
    VISITED_LINKS.add(url)
    try:
        logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
@@ -188,17 +182,6 @@ async def crawl(url: str, session: aiohttp.ClientSession):
            status_code = response.status
            content_type = response.headers.get('content-type')
            # if it was a redirect to a link with a trailing slash, handle that url
            if 300 < status_code < 400:
                location = response.headers.get('location', '')
                # todo rewrite logic
                if without_trailing_slash in location:
                    if not should_exclude(location):
                        # crawl the cleaned redirect target
                        logger.info(f'Trailing slash. {location}')
                        cleaned_link = list(cleanup_links({location}))[0]
                        await asyncio.gather(crawl(cleaned_link, session))
            if status_code != 200:
                return
@@ -220,9 +203,18 @@ async def crawl(url: str, session: aiohttp.ClientSession):
            else:
                # TODO track hashes of image/svg/video content types
                logger.info(f'Unhandled type: {content_type}')
            # Telegram urls can work both with and without a trailing slash (no redirect),
            # though not on every subdomain, so it is random which form of a link gets
            # added. This check resolves that: when both forms were collected, prefer
            # the one without the trailing slash.
            without_trailing_slash = url[:-1:] if url.endswith('/') else url
            if without_trailing_slash in LINKS_TO_TRACK and \
                    f'{without_trailing_slash}/' in LINKS_TO_TRACK:
                LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')
    except UnicodeDecodeError:
        logger.warning('Codec can\'t decode byte, so it was a tgs file')
    except (TimeoutError, ClientConnectorError):
        logger.warning(f'Retrying {url}')
        await asyncio.gather(crawl(url, session))
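
The trailing-slash preference in isolation, on toy data; the same logic as the block above:

LINKS_TO_TRACK = {'telegram.org/faq', 'telegram.org/faq/', 'telegram.org/blog/'}

url = 'telegram.org/faq/'
without_trailing_slash = url[:-1] if url.endswith('/') else url
# both forms were collected, so drop the one with the trailing slash
if without_trailing_slash in LINKS_TO_TRACK and \
        f'{without_trailing_slash}/' in LINKS_TO_TRACK:
    LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')

print(sorted(LINKS_TO_TRACK))
# ['telegram.org/blog/', 'telegram.org/faq'] -- blog/ stays, it has no bare twin
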
@@ -239,5 +231,15 @@ if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))
    logger.info(f'Stop crawling links. {time() - start_time} sec.')
    try:
        with open(OUTPUT_FILENAME, 'r') as f:
            OLD_URL_LIST = {line.rstrip('\n') for line in f}
        logger.info(f'Is equal: {OLD_URL_LIST == LINKS_TO_TRACK}')
        logger.info(f'Deleted: {OLD_URL_LIST - LINKS_TO_TRACK}')
        logger.info(f'Added: {LINKS_TO_TRACK - OLD_URL_LIST}')
    except IOError:
        pass
    with open(OUTPUT_FILENAME, 'w') as f:
        f.write('\n'.join(sorted(LINKS_TO_TRACK)))
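
What the new diff logging reports on a toy pair of sets; this is plain set difference, so the printed sets are unordered:

OLD_URL_LIST = {'telegram.org/faq', 'telegram.org/blog'}
LINKS_TO_TRACK = {'telegram.org/faq', 'telegram.org/apps'}

print('Is equal:', OLD_URL_LIST == LINKS_TO_TRACK)  # Is equal: False
print('Deleted:', OLD_URL_LIST - LINKS_TO_TRACK)    # Deleted: {'telegram.org/blog'}
print('Added:', LINKS_TO_TRACK - OLD_URL_LIST)      # Added: {'telegram.org/apps'}
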