mirror of
https://github.com/MarshalX/telegram-crawler.git
synced 2024-11-22 07:19:34 +01:00
rework exclude system; add exclude by count of slashes; add translations.telegram.org
This commit is contained in:
parent
e385f45da6
commit
b2afbd72fc
1 changed files with 73 additions and 53 deletions
|
@ -16,45 +16,66 @@ HIDDEN_URLS = {
|
|||
|
||||
'telegram.org/privacy/gmailbot',
|
||||
'telegram.org/tos',
|
||||
'telegram.org/tour'
|
||||
'telegram.org/tour',
|
||||
|
||||
'translations.telegram.org',
|
||||
'translations.telegram.org/en/android',
|
||||
'translations.telegram.org/en/ios',
|
||||
'translations.telegram.org/en/tdesktop',
|
||||
'translations.telegram.org/en/macos',
|
||||
'translations.telegram.org/en/android_x',
|
||||
}
|
||||
BASE_URL_REGEX = r'telegram.org'
|
||||
|
||||
# disable crawling sub links for specific domains and url patches
|
||||
EXCLUDE_RULES = {
|
||||
# '' means exclude all
|
||||
# '*' means exclude all
|
||||
'translations.telegram.org': {
|
||||
'',
|
||||
# 'max_count_of_slashes': 3,
|
||||
'patches': {
|
||||
'*',
|
||||
}
|
||||
},
|
||||
'bugs.telegram.org': {
|
||||
'c/',
|
||||
'patches': {
|
||||
'c/',
|
||||
},
|
||||
},
|
||||
'instantview.telegram.org': {
|
||||
'file/',
|
||||
'patches': {
|
||||
'file/',
|
||||
|
||||
'templates/',
|
||||
'samples/',
|
||||
'contest/',
|
||||
'templates/',
|
||||
'samples/',
|
||||
'contest/',
|
||||
},
|
||||
},
|
||||
'corefork.telegram.org': {
|
||||
'file/',
|
||||
'patches': {
|
||||
'file/',
|
||||
|
||||
'tdlib/',
|
||||
'tdlib/docs/',
|
||||
|
||||
'constructor/',
|
||||
'method/',
|
||||
'type/',
|
||||
'constructor/',
|
||||
'method/',
|
||||
'type/',
|
||||
},
|
||||
},
|
||||
'core.telegram.org': {
|
||||
'file/',
|
||||
'patches': {
|
||||
'file/',
|
||||
|
||||
'tdlib/',
|
||||
'tdlib/docs/',
|
||||
|
||||
'constructor/',
|
||||
'method/',
|
||||
'type/',
|
||||
'constructor/',
|
||||
'method/',
|
||||
'type/',
|
||||
},
|
||||
},
|
||||
'telegram.org': {
|
||||
'file/',
|
||||
'patches': {
|
||||
'file/',
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -76,35 +97,37 @@ VISITED_LINKS = set()
|
|||
LINKS_TO_TRACK = set()
|
||||
|
||||
|
||||
def should_exclude(url: str, direct_link=None) -> bool:
|
||||
if not direct_link:
|
||||
direct_link = re.findall(DIRECT_LINK_REGEX, url)[0]
|
||||
domain_exclude_rules = EXCLUDE_RULES.get(direct_link, dict())
|
||||
|
||||
max_count_of_slashes = domain_exclude_rules.get('max_count_of_slashes')
|
||||
exclude_patches = domain_exclude_rules.get('patches', set())
|
||||
|
||||
if '*' in exclude_patches:
|
||||
return True
|
||||
|
||||
if max_count_of_slashes and max_count_of_slashes < url.count('/'):
|
||||
return True
|
||||
|
||||
for path in exclude_patches:
|
||||
if path in url:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def find_absolute_links(html: str) -> set[str]:
|
||||
absolute_links = set(re.findall(ABSOLUTE_LINK_REGEX, html))
|
||||
|
||||
filtered_links = set()
|
||||
for link in absolute_links:
|
||||
def _():
|
||||
direct_link = re.findall(DIRECT_LINK_REGEX, link)[0]
|
||||
exceptions = EXCLUDE_RULES.get(direct_link, set())
|
||||
# optimization. when we want to exclude domain
|
||||
if '' in exceptions:
|
||||
return
|
||||
|
||||
for exclude_path in exceptions:
|
||||
if exclude_path in link:
|
||||
return
|
||||
|
||||
filtered_links.add(link)
|
||||
|
||||
_()
|
||||
# Yeah, I don't care about DRY
|
||||
|
||||
return filtered_links
|
||||
return {link for link in absolute_links if not should_exclude(link)}
|
||||
|
||||
|
||||
def find_relative_links(html: str, cur_link: str) -> set[str]:
|
||||
direct_cur_link = re.findall(DIRECT_LINK_REGEX, cur_link)[0]
|
||||
exceptions = EXCLUDE_RULES.get(direct_cur_link, set())
|
||||
# optimization. when we want to exclude domain
|
||||
if '' in exceptions:
|
||||
if should_exclude(cur_link):
|
||||
return set()
|
||||
|
||||
relative_links = set()
|
||||
|
@ -113,20 +136,17 @@ def find_relative_links(html: str, cur_link: str) -> set[str]:
|
|||
links = re.findall(regex, html)
|
||||
|
||||
for link in links:
|
||||
def _():
|
||||
for exclude_path in exceptions:
|
||||
if exclude_path in link:
|
||||
return
|
||||
if should_exclude(link, direct_cur_link):
|
||||
continue
|
||||
|
||||
if link.startswith('/'):
|
||||
# bypass //www.apple and etc shit ;d
|
||||
if find_absolute_links(link):
|
||||
# absolute links starting with double slash
|
||||
if link.startswith('/'):
|
||||
# bypass //www.apple and etc shit ;d
|
||||
if find_absolute_links(link):
|
||||
# absolute links starting with double slash
|
||||
if not should_exclude(link):
|
||||
relative_links.add(link[1::])
|
||||
else:
|
||||
relative_links.add(f'{direct_cur_link}/{link}')
|
||||
|
||||
_()
|
||||
else:
|
||||
relative_links.add(f'{direct_cur_link}/{link}')
|
||||
|
||||
return relative_links
|
||||
|
||||
|
@ -187,7 +207,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
|||
# TODO track hashes of image/svg/video content types
|
||||
logger.info(f'Unhandled type: {content_type}')
|
||||
except:
|
||||
logger.warning('Codec can\'t decode byte. So its was a tgs file')
|
||||
logger.warning('Mb codec can\'t decode byte. So its was a tgs file')
|
||||
|
||||
|
||||
async def start(url_list: set[str]):
|
||||
|
|
Loading…
Reference in a new issue