mirror of
https://github.com/MarshalX/telegram-crawler.git
synced 2024-10-23 17:47:21 +02:00
add tracking of all media
This commit is contained in:
parent
e217c3f5ec
commit
961320022c
3 changed files with 71 additions and 20 deletions
|
@ -79,10 +79,6 @@ resources (-s flag of apktool to disable disassembly of dex files).
|
||||||
Writing a check for the need for decompilation by the hash of the apk file
|
Writing a check for the need for decompilation by the hash of the apk file
|
||||||
would take more time.
|
would take more time.
|
||||||
|
|
||||||
### TODO list
|
|
||||||
|
|
||||||
- add storing hashes of image, svg, video.
|
|
||||||
|
|
||||||
### Example of link crawler rules configuration
|
### Example of link crawler rules configuration
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
|
|
@ -47,6 +47,10 @@ logging.basicConfig(format='%(message)s', level=logging.DEBUG)
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def get_hash(data: bytes) -> str:
    """Return the SHA-256 digest of ``data`` as a lowercase hex string."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
async def download_file(url, path, session):
|
async def download_file(url, path, session):
|
||||||
async with session.get(url) as response:
|
async with session.get(url) as response:
|
||||||
if response.status != 200:
|
if response.status != 200:
|
||||||
|
@ -96,7 +100,7 @@ async def track_additional_files(
|
||||||
content = await r_file.read()
|
content = await r_file.read()
|
||||||
|
|
||||||
if save_hash_only:
|
if save_hash_only:
|
||||||
await w_file.write(hashlib.sha256(content).hexdigest())
|
await w_file.write(get_hash(content))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
|
content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
|
||||||
|
@ -194,6 +198,23 @@ async def collect_translations_paginated_content(url: str, session: aiohttp.Clie
|
||||||
return '\n'.join(content)
|
return '\n'.join(content)
|
||||||
|
|
||||||
|
|
||||||
|
def is_hashable_only_content_type(content_type) -> bool:
    """Tell whether content of this type should be stored as a hash only.

    The check is a substring match against a fixed list of image and video
    markers, so parameters such as ``; charset=...`` do not get in the way.

    Args:
        content_type: Value of the response content type. May be ``None`` or
            empty when the server did not send one.

    Returns:
        ``True`` if any known media marker occurs in ``content_type``,
        ``False`` otherwise (including for a missing content type).
    """
    # A missing header must not crash the caller with a TypeError.
    if not content_type:
        return False

    hashable_only_content_types = (
        'png',
        'jpeg',
        'x-icon',
        'gif',
        'mp4',
        'webm',
    )

    # Media types are case-insensitive, so compare in lower case.
    normalized = content_type.lower()
    return any(marker in normalized for marker in hashable_only_content_types)
|
||||||
|
|
||||||
|
|
||||||
async def crawl(url: str, session: aiohttp.ClientSession):
|
async def crawl(url: str, session: aiohttp.ClientSession):
|
||||||
try:
|
try:
|
||||||
logger.info(f'Process {url}')
|
logger.info(f'Process {url}')
|
||||||
|
@ -210,16 +231,35 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
||||||
|
|
||||||
# bypass external slashes and so on
|
# bypass external slashes and so on
|
||||||
url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
|
url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
|
||||||
|
|
||||||
|
is_hashable_only = is_hashable_only_content_type(response.content_type)
|
||||||
|
# amazing dirt for media files like
|
||||||
|
# telegram.org/file/811140591/1/q7zZHjgES6s/9d121a89ffb0015837
|
||||||
|
# with response content type HTML instead of image. Shame on you
|
||||||
|
# sometimes it returns correct type. noice load balancing
|
||||||
|
is_sucking_file = '/file/' in url and 'text' in response.content_type
|
||||||
|
|
||||||
# handle pure domains and html pages without ext in url
|
# handle pure domains and html pages without ext in url
|
||||||
ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
|
ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
|
||||||
|
|
||||||
|
# I don't add ext by content type for images and so on cuz TG servers sucks.
|
||||||
|
# Some servers do not return correct content type. Some servers do...
|
||||||
|
if is_hashable_only or is_sucking_file:
|
||||||
|
ext = ''
|
||||||
|
|
||||||
filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
|
filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
|
||||||
|
os.makedirs(os.path.dirname(filename), exist_ok=True)
|
||||||
|
|
||||||
|
if is_sucking_file or is_hashable_only:
|
||||||
|
content = await response.read()
|
||||||
|
async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
|
||||||
|
await f.write(get_hash(content))
|
||||||
|
return
|
||||||
|
|
||||||
content = await response.text(encoding='UTF-8')
|
content = await response.text(encoding='UTF-8')
|
||||||
if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
|
if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
|
||||||
content = await collect_translations_paginated_content(url, session)
|
content = await collect_translations_paginated_content(url, session)
|
||||||
|
|
||||||
os.makedirs(os.path.dirname(filename), exist_ok=True)
|
|
||||||
async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
|
async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
|
||||||
content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
|
content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
|
||||||
content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
|
content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
|
||||||
|
|
|
@ -12,7 +12,7 @@ from aiohttp import ClientConnectorError, ServerDisconnectedError
|
||||||
|
|
||||||
PROTOCOL = 'https://'
|
PROTOCOL = 'https://'
|
||||||
BASE_URL = 'telegram.org'
|
BASE_URL = 'telegram.org'
|
||||||
# its necessary to help crawler to find more links
|
# it's necessary to help crawler to find more links
|
||||||
HIDDEN_URLS = {
|
HIDDEN_URLS = {
|
||||||
'corefork.telegram.org',
|
'corefork.telegram.org',
|
||||||
'corefork.telegram.org/getProxyConfig',
|
'corefork.telegram.org/getProxyConfig',
|
||||||
|
@ -111,8 +111,7 @@ CRAWL_RULES = {
|
||||||
},
|
},
|
||||||
'telegram.org': {
|
'telegram.org': {
|
||||||
'deny': {
|
'deny': {
|
||||||
'file/',
|
r'apps$',
|
||||||
r'apps$'
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
'webz.telegram.org': {
|
'webz.telegram.org': {
|
||||||
|
@ -227,6 +226,28 @@ def cleanup_links(links: set[str]) -> set[str]:
|
||||||
return cleaned_links
|
return cleaned_links
|
||||||
|
|
||||||
|
|
||||||
|
def is_trackable_content_type(content_type) -> bool:
    """Tell whether a link with this content type should be tracked.

    The check is a substring match against a fixed list of text and media
    markers, so values such as ``application/javascript; charset=utf-8`` or
    ``image/svg+xml`` are recognised.

    Args:
        content_type: Value of the response content type. May be ``None`` or
            empty when the server did not send one.

    Returns:
        ``True`` if any known marker occurs in ``content_type``, ``False``
        otherwise (including for a missing content type).
    """
    # A missing header must not crash the caller with a TypeError.
    if not content_type:
        return False

    trackable_content_types = (
        'javascript',
        'css',
        'plain',
        'json',
        'svg',
        'png',
        'jpeg',
        'x-icon',
        'gif',
        'mp4',
        'webm',
    )

    # Media types are case-insensitive, so compare in lower case.
    normalized = content_type.lower()
    return any(marker in normalized for marker in trackable_content_types)
|
||||||
|
|
||||||
|
|
||||||
async def crawl(url: str, session: aiohttp.ClientSession):
|
async def crawl(url: str, session: aiohttp.ClientSession):
|
||||||
if url in VISITED_LINKS:
|
if url in VISITED_LINKS:
|
||||||
return
|
return
|
||||||
|
@ -257,19 +278,13 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
||||||
|
|
||||||
sub_links = absolute_links | relative_links
|
sub_links = absolute_links | relative_links
|
||||||
await asyncio.gather(*[crawl(url, session) for url in sub_links])
|
await asyncio.gather(*[crawl(url, session) for url in sub_links])
|
||||||
elif 'application/javascript' in content_type:
|
elif is_trackable_content_type(content_type):
|
||||||
LINKS_TO_TRACK.add(url)
|
|
||||||
elif 'css' in content_type:
|
|
||||||
LINKS_TO_TRACK.add(url)
|
|
||||||
elif 'plain' in content_type:
|
|
||||||
LINKS_TO_TRACK.add(url)
|
|
||||||
elif 'application/json' in content_type:
|
|
||||||
LINKS_TO_TRACK.add(url)
|
LINKS_TO_TRACK.add(url)
|
||||||
else:
|
else:
|
||||||
# TODO track hashes of image/svg/video content types
|
# for example, zip with update of macOS client
|
||||||
logger.info(f'Unhandled type: {content_type}')
|
logger.info(f'Unhandled type: {content_type} from {url}')
|
||||||
|
|
||||||
# telegram url can work with and without trailing slash (no redirect). P.S. not on every sub domain ;d
|
# telegram url can work with and without trailing slash (no redirect). P.S. not on every subdomain ;d
|
||||||
# so it is a problem: which of the two link variants gets added is effectively random
|
# so it is a problem: which of the two link variants gets added is effectively random
|
||||||
# this check resolves the issue: if both links are available, we prefer the one without the trailing slash
|
# this check resolves the issue: if both links are available, we prefer the one without the trailing slash
|
||||||
without_trailing_slash = url[:-1:] if url.endswith('/') else url
|
without_trailing_slash = url[:-1:] if url.endswith('/') else url
|
||||||
|
@ -277,7 +292,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
||||||
f'{without_trailing_slash}/' in LINKS_TO_TRACK:
|
f'{without_trailing_slash}/' in LINKS_TO_TRACK:
|
||||||
LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')
|
LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
logger.warning('Codec can\'t decode byte. So its was a tgs file')
|
logger.warning(f'Codec can\'t decode bytes. So it was a tgs file or response with broken content type {url}')
|
||||||
except ClientConnectorError:
|
except ClientConnectorError:
|
||||||
logger.warning(f'Wrong link: {url}')
|
logger.warning(f'Wrong link: {url}')
|
||||||
except (ServerDisconnectedError, TimeoutError):
|
except (ServerDisconnectedError, TimeoutError):
|
||||||
|
|
Loading…
Reference in a new issue