mirror of
https://github.com/MarshalX/telegram-crawler.git
synced 2025-03-08 10:24:43 +01:00
add ability to set hidden urls
This commit is contained in:
parent
fba0bd1d64
commit
d449290dfb
1 changed files with 27 additions and 3 deletions
|
@ -9,6 +9,14 @@ import aiohttp
|
|||
|
||||
PROTOCOL = 'https://'
|
||||
BASE_URL = 'telegram.org'
|
||||
# It's necessary to help the crawler find more links
|
||||
HIDDEN_URLS = {
|
||||
'corefork.telegram.org',
|
||||
|
||||
'telegram.org/privacy/gmailbot',
|
||||
'telegram.org/tos',
|
||||
'telegram.org/tour'
|
||||
}
|
||||
BASE_URL_REGEX = r'telegram.org'
|
||||
|
||||
EXCLUDE_RULES = {
|
||||
|
@ -26,6 +34,15 @@ EXCLUDE_RULES = {
|
|||
'samples/',
|
||||
'contest/',
|
||||
},
|
||||
'corefork.telegram.org': {
|
||||
'file/',
|
||||
|
||||
'tdlib/',
|
||||
|
||||
'constructor/',
|
||||
'method/',
|
||||
'type/',
|
||||
},
|
||||
'core.telegram.org': {
|
||||
'file/',
|
||||
|
||||
|
@ -135,7 +152,7 @@ def cleanup_links(links: set[str]) -> set[str]:
|
|||
return cleaned_links
|
||||
|
||||
|
||||
async def main(url: str):
|
||||
async def crawl(url: str):
|
||||
if url.endswith('/'):
|
||||
url = url[:-1:]
|
||||
if url in VISITED_LINKS or '"' in url:
|
||||
|
@ -160,7 +177,7 @@ async def main(url: str):
|
|||
|
||||
sub_links = absolute_links | relative_links
|
||||
for link in sub_links:
|
||||
await asyncio.create_task(main(link))
|
||||
await asyncio.create_task(crawl(link))
|
||||
elif 'application/javascript' in content_type:
|
||||
LINKS_TO_TRACK.add(url)
|
||||
elif 'text/css' in content_type:
|
||||
|
@ -174,8 +191,15 @@ async def main(url: str):
|
|||
logger.error('Codec can\'t decode byte. So its was a tgs file')
|
||||
|
||||
|
||||
async def start(url_list: set[str]):
    """Crawl every URL in ``url_list``, one after another.

    Each URL is handed to ``crawl`` wrapped in its own asyncio task, and that
    task is awaited to completion before the next URL is scheduled.

    :param url_list: Set of start URLs passed to ``crawl``.
    """
    for start_url in url_list:
        crawl_task = asyncio.create_task(crawl(start_url))
        # Awaiting right away keeps the traversal sequential.
        await crawl_task
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
asyncio.get_event_loop().run_until_complete(main(BASE_URL))
|
||||
HIDDEN_URLS.add(BASE_URL)
|
||||
|
||||
asyncio.get_event_loop().run_until_complete(start(HIDDEN_URLS))
|
||||
asyncio.get_event_loop().run_until_complete(SESSION.close())
|
||||
|
||||
with open(OUTPUT_FILENAME, 'w') as f:
|
||||
|
|
Loading…
Add table
Reference in a new issue