Mirror of https://github.com/MarshalX/telegram-crawler.git (synced 2024-11-22 07:19:34 +01:00)
rewrite logic of rules system (now it uses regex); fix trailing slash handling; multiple attempts to send requests
This commit is contained in:
parent b2afbd72fc
commit cd5653605f
1 changed file with 68 additions and 59 deletions
@@ -7,65 +7,59 @@ from time import time
 from urllib.parse import unquote

 import aiohttp
+from aiohttp import ClientConnectorError

 PROTOCOL = 'https://'
 BASE_URL = 'telegram.org'
 # its necessary to help crawler to find more links
 HIDDEN_URLS = {
-    'corefork.telegram.org',
+    # 'corefork.telegram.org',

     'telegram.org/privacy/gmailbot',
     'telegram.org/tos',
     'telegram.org/tour',

-    'translations.telegram.org',
-    'translations.telegram.org/en/android',
-    'translations.telegram.org/en/ios',
-    'translations.telegram.org/en/tdesktop',
-    'translations.telegram.org/en/macos',
-    'translations.telegram.org/en/android_x',
+    'desktop.telegram.org/changelog',
 }
 BASE_URL_REGEX = r'telegram.org'

-# disable crawling sub links for specific domains and url patches
-EXCLUDE_RULES = {
-    # '*' means exclude all
+# disable crawling sub links for specific domains and url patterns
+CRAWL_RULES = {
+    # every rule is regex
+    # empty string means match any url
+    # allow rules with high priority than deny
     'translations.telegram.org': {
-        # 'max_count_of_slashes': 3,
-        'patches': {
-            '*',
+        'allow': {
+            r'^[^/]*$',  # root
+            r'org/[^/]*/$',  # 1 lvl sub
+            r'/en/[a-z_]+/$'  # 1 lvl after /en/
+        },
+        'deny': {
+            '',  # all
         }
     },
-    'bugs.telegram.org': {
-        'patches': {
-            'c/',
+    'bugs.telegram.org': {  # crawl first page of cards sorted by rating
+        'deny': {
+            r'/c/[0-9]+/[0-9]+',  # disable comments
         },
     },
     'instantview.telegram.org': {
-        'patches': {
+        'allow': {
+            'contest/winners'
+        },
+        'deny': {
             'file/',

-            'templates/',
+            r'templates/.+',
             'samples/',
             'contest/',
         },
     },
-    'corefork.telegram.org': {
-        'patches': {
-            'file/',
-
-            'tdlib/docs/',
-
-            'constructor/',
-            'method/',
-            'type/',
-        },
-    },
     'core.telegram.org': {
-        'patches': {
+        'deny': {
             'file/',

-            'tdlib/docs/',
+            'tdlib/docs/classtd',

             'constructor/',
             'method/',
@@ -73,7 +67,7 @@ EXCLUDE_RULES = {
         },
     },
     'telegram.org': {
-        'patches': {
+        'deny': {
             'file/',
         },
     }
@@ -89,6 +83,7 @@ OUTPUT_FILENAME = os.environ.get('OUTPUT_FILENAME', 'tracked_links.txt')

 # unsecure but so simple
 CONNECTOR = aiohttp.TCPConnector(ssl=False)
+TIMEOUT = aiohttp.ClientTimeout(total=30)

 logging.basicConfig(format='%(message)s', level=logging.DEBUG)
 logger = logging.getLogger(__name__)
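The new TIMEOUT constant is defined next to the connector, but this hunk does not show where it is wired in; presumably it is handed to the ClientSession or to individual requests. A minimal sketch, assuming it is applied at session construction (the URL is only an example):

import asyncio
import aiohttp

TIMEOUT = aiohttp.ClientTimeout(total=30)  # 30 seconds for the whole request

async def main():
    connector = aiohttp.TCPConnector(ssl=False)
    # a timeout set on the session applies to every request made with it
    async with aiohttp.ClientSession(connector=connector, timeout=TIMEOUT) as session:
        async with session.get('https://telegram.org', allow_redirects=False) as response:
            print(response.status)

asyncio.run(main())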
@@ -97,26 +92,29 @@ VISITED_LINKS = set()
 LINKS_TO_TRACK = set()


-def should_exclude(url: str, direct_link=None) -> bool:
-    if not direct_link:
-        direct_link = re.findall(DIRECT_LINK_REGEX, url)[0]
-
-    domain_exclude_rules = EXCLUDE_RULES.get(direct_link, dict())
-    max_count_of_slashes = domain_exclude_rules.get('max_count_of_slashes')
-    exclude_patches = domain_exclude_rules.get('patches', set())
-
-    if '*' in exclude_patches:
-        return True
-
-    if max_count_of_slashes and max_count_of_slashes < url.count('/'):
-        return True
-
-    for path in exclude_patches:
-        if path in url:
-            return True
-
-    return False
+def should_exclude(url: str) -> bool:
+    direct_link = re.findall(DIRECT_LINK_REGEX, url)[0]
+    domain_rules = CRAWL_RULES.get(direct_link)
+    if not domain_rules:
+        return False
+
+    allow_rules = domain_rules.get('allow', set())
+    deny_rules = domain_rules.get('deny', set())
+
+    exclude = False
+
+    for regex in deny_rules:
+        if re.search(regex, url):
+            exclude = True
+            break
+
+    for regex in allow_rules:
+        if re.search(regex, url):
+            exclude = False
+            break
+
+    return exclude


 def find_absolute_links(html: str) -> set[str]:
     absolute_links = set(re.findall(ABSOLUTE_LINK_REGEX, html))
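For illustration, a standalone sketch of the deny-then-allow evaluation above, run against the new translations.telegram.org rules; the sample URLs are hypothetical, and any() replaces the explicit break loops, which is behaviourally equivalent here:

import re

RULES = {  # same shape as the new CRAWL_RULES entry for translations.telegram.org
    'allow': {
        r'^[^/]*$',        # root
        r'org/[^/]*/$',    # 1 lvl sub
        r'/en/[a-z_]+/$',  # 1 lvl after /en/
    },
    'deny': {
        '',  # empty pattern matches every url
    },
}

def excluded(url: str, rules: dict) -> bool:
    # deny first, then any matching allow rule overrides the exclusion
    exclude = any(re.search(regex, url) for regex in rules['deny'])
    if any(re.search(regex, url) for regex in rules['allow']):
        exclude = False
    return exclude

print(excluded('translations.telegram.org', RULES))                       # False, kept (root)
print(excluded('translations.telegram.org/en/android/', RULES))           # False, kept (/en/ page)
print(excluded('translations.telegram.org/en/android/unsorted/', RULES))  # True, skipped (too deep)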
@@ -136,17 +134,16 @@ def find_relative_links(html: str, cur_link: str) -> set[str]:
     links = re.findall(regex, html)

     for link in links:
-        if should_exclude(link, direct_cur_link):
-            continue
-
-        if link.startswith('/'):
-            # bypass //www.apple and etc shit ;d
-            if find_absolute_links(link):
-                # absolute links starting with double slash
-                if not should_exclude(link):
-                    relative_links.add(link[1::])
+        # bypass //www.apple and etc shit ;d
+        if link.startswith('/'):
+            # absolute links starting with double slash
+            if find_absolute_links(link):
+                if not should_exclude(link[1::]):
+                    relative_links.add(link[1::])
         else:
-            relative_links.add(f'{direct_cur_link}/{link}')
+            url = f'{direct_cur_link}/{link}'
+            if not should_exclude(url):
+                relative_links.add(url)

     return relative_links
@@ -173,11 +170,10 @@ def cleanup_links(links: set[str]) -> set[str]:


 async def crawl(url: str, session: aiohttp.ClientSession):
-    if url.endswith('/'):
-        url = url[:-1:]
-    if url in VISITED_LINKS or '"' in url:
+    without_trailing_slash = url[:-1:] if url.endswith('/') else url
+    if without_trailing_slash in VISITED_LINKS:
         return
-    VISITED_LINKS.add(url)
+    VISITED_LINKS.add(without_trailing_slash)

     try:
         logger.info(f'[{len(VISITED_LINKS)}] Process {url}')
@@ -185,6 +181,17 @@ async def crawl(url: str, session: aiohttp.ClientSession):
             status_code = response.status
             content_type = response.headers.get('content-type')

+            # if it was redirect to link with trailing slash - handle this url
+            if 300 < status_code < 400:
+                location = response.headers.get('location', '')
+                # todo rewrite logic
+                if without_trailing_slash in location:
+                    if not should_exclude(location):
+                        # nice shit bro
+                        logger.info(f'Trailing slash. {location}')
+                        cleaned_link = list(cleanup_links({location}))[0]
+                        await asyncio.gather(crawl(cleaned_link, session))
+
             if status_code != 200:
                 return

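A small standalone illustration of the redirect branch added above: any 3xx answer whose Location still contains the slash-stripped URL is treated as the same page with a trailing slash and fed back into crawl. The status code and Location values below are made-up examples:

def is_trailing_slash_redirect(status_code: int, location: str, without_trailing_slash: str) -> bool:
    # mirrors the condition in the new branch: a 3xx status plus the original url inside Location
    return 300 < status_code < 400 and without_trailing_slash in location

print(is_trailing_slash_redirect(302, 'https://telegram.org/tour/', 'telegram.org/tour'))      # True, re-crawled
print(is_trailing_slash_redirect(302, 'https://telegram.org/moderation', 'telegram.org/tour')) # False, ignored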
@@ -206,8 +213,10 @@ async def crawl(url: str, session: aiohttp.ClientSession):
             else:
                 # TODO track hashes of image/svg/video content types
                 logger.info(f'Unhandled type: {content_type}')
-    except:
-        logger.warning('Mb codec can\'t decode byte. So its was a tgs file')
+    except UnicodeDecodeError:
+        logger.warning('Codec can\'t decode byte. So its was a tgs file')
+    except ClientConnectorError:
+        await asyncio.gather(crawl(url, session))


 async def start(url_list: set[str]):
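The new ClientConnectorError handler re-invokes crawl with the same URL, but by this point the URL has already been added to VISITED_LINKS, so the guard at the top of crawl returns before a new request is sent. A hypothetical bounded-retry helper (not part of this commit) that sidesteps the guard by retrying at the request level instead:

import asyncio
import aiohttp
from aiohttp import ClientConnectorError

async def get_with_retries(session: aiohttp.ClientSession, url: str, attempts: int = 3) -> aiohttp.ClientResponse:
    # try the same GET a few times with a small linear backoff before giving up
    for attempt in range(1, attempts + 1):
        try:
            return await session.get(url, allow_redirects=False)
        except ClientConnectorError:
            if attempt == attempts:
                raise
            await asyncio.sleep(attempt)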