add tracking of all media

Il'ya (Marshal) 2022-04-10 17:46:36 +02:00
parent e217c3f5ec
commit 961320022c
3 changed files with 71 additions and 20 deletions

View file

@@ -79,10 +79,6 @@ resources (-s flag of apktool to disable disassembly of dex files).
Writing a check that uses the hash of the apk file to decide whether decompilation
is needed would take more time.
### TODO list
- add storing hashes of image, svg, video.
### Example of link crawler rules configuration
```python

View file

@@ -47,6 +47,10 @@ logging.basicConfig(format='%(message)s', level=logging.DEBUG)
logger = logging.getLogger(__name__)
def get_hash(data: bytes) -> str:
return hashlib.sha256(data).hexdigest()
async def download_file(url, path, session):
async with session.get(url) as response:
if response.status != 200:
@@ -96,7 +100,7 @@ async def track_additional_files(
content = await r_file.read()
if save_hash_only:
await w_file.write(hashlib.sha256(content).hexdigest())
await w_file.write(get_hash(content))
continue
content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
@@ -194,6 +198,23 @@ async def collect_translations_paginated_content(url: str, session: aiohttp.Clie
return '\n'.join(content)
def is_hashable_only_content_type(content_type) -> bool:
hashable_only_content_types = (
'png',
'jpeg',
'x-icon',
'gif',
'mp4',
'webm',
)
for hashable_only_content_type in hashable_only_content_types:
if hashable_only_content_type in content_type:
return True
return False
async def crawl(url: str, session: aiohttp.ClientSession):
try:
logger.info(f'Process {url}')
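
For quick reference, here is a minimal standalone sketch of the hashing helper and the substring-based content-type check added in the hunks above. The function names mirror the diff; the `any(...)` form and the sample values under `__main__` are illustrative, not part of the commit.

```python
import hashlib


def get_hash(data: bytes) -> str:
    # sha256 hex digest of the raw response bytes
    return hashlib.sha256(data).hexdigest()


def is_hashable_only_content_type(content_type: str) -> bool:
    # media types whose bodies are never stored; only their hash is tracked
    hashable_only_content_types = ('png', 'jpeg', 'x-icon', 'gif', 'mp4', 'webm')
    return any(t in content_type for t in hashable_only_content_types)


if __name__ == '__main__':
    print(get_hash(b'example'))                         # stable digest for the same bytes
    print(is_hashable_only_content_type('image/png'))   # True
    print(is_hashable_only_content_type('text/html'))   # False
```
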
@@ -210,16 +231,35 @@ async def crawl(url: str, session: aiohttp.ClientSession):
        # skip empty parts produced by leading/trailing slashes and so on
url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
is_hashable_only = is_hashable_only_content_type(response.content_type)
        # workaround for media files like
        # telegram.org/file/811140591/1/q7zZHjgES6s/9d121a89ffb0015837
        # that are served with an HTML content type instead of an image one.
        # Sometimes the correct type is returned; inconsistent load balancing.
is_sucking_file = '/file/' in url and 'text' in response.content_type
# handle pure domains and html pages without ext in url
ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
        # I don't add an extension based on content type for images and the like, because TG servers are unreliable.
        # Some servers do not return the correct content type. Some servers do...
if is_hashable_only or is_sucking_file:
ext = ''
filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
os.makedirs(os.path.dirname(filename), exist_ok=True)
if is_sucking_file or is_hashable_only:
content = await response.read()
async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
await f.write(get_hash(content))
return
content = await response.text(encoding='UTF-8')
if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
content = await collect_translations_paginated_content(url, session)
os.makedirs(os.path.dirname(filename), exist_ok=True)
async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
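
The comments above explain why media responses are reduced to a hash before hitting disk. Below is a small sketch of that save path in isolation, assuming aiofiles is installed; save_media_hash is a hypothetical wrapper written for illustration, while get_hash and the folder + url_parts path construction follow the diff.

```python
import asyncio
import hashlib
import os

import aiofiles  # third-party: pip install aiofiles


def get_hash(data: bytes) -> str:
    return hashlib.sha256(data).hexdigest()


async def save_media_hash(url_parts: list[str], content: bytes, output_folder: str) -> None:
    # store only the sha256 digest of the media bytes, never the bytes themselves
    filename = output_folder + '/'.join(url_parts)
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
        await f.write(get_hash(content))


if __name__ == '__main__':
    # placeholder path parts and bytes, not taken from the commit
    asyncio.run(save_media_hash(['telegram.org', 'file', 'example'], b'fake media bytes', 'data/'))
```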

View file

@@ -12,7 +12,7 @@ from aiohttp import ClientConnectorError, ServerDisconnectedError
PROTOCOL = 'https://'
BASE_URL = 'telegram.org'
# its necessary to help crawler to find more links
# it's necessary to help the crawler find more links
HIDDEN_URLS = {
'corefork.telegram.org',
'corefork.telegram.org/getProxyConfig',
@@ -111,8 +111,7 @@ CRAWL_RULES = {
},
'telegram.org': {
'deny': {
'file/',
r'apps$'
r'apps$',
},
},
'webz.telegram.org': {
@@ -227,6 +226,28 @@ def cleanup_links(links: set[str]) -> set[str]:
return cleaned_links
def is_trackable_content_type(content_type) -> bool:
trackable_content_types = (
'javascript',
'css',
'plain',
'json',
'svg',
'png',
'jpeg',
'x-icon',
'gif',
'mp4',
'webm',
)
for trackable_content_type in trackable_content_types:
if trackable_content_type in content_type:
return True
return False
async def crawl(url: str, session: aiohttp.ClientSession):
if url in VISITED_LINKS:
return
@@ -257,19 +278,13 @@ async def crawl(url: str, session: aiohttp.ClientSession):
sub_links = absolute_links | relative_links
await asyncio.gather(*[crawl(url, session) for url in sub_links])
elif 'application/javascript' in content_type:
LINKS_TO_TRACK.add(url)
elif 'css' in content_type:
LINKS_TO_TRACK.add(url)
elif 'plain' in content_type:
LINKS_TO_TRACK.add(url)
elif 'application/json' in content_type:
elif is_trackable_content_type(content_type):
LINKS_TO_TRACK.add(url)
else:
# TODO track hashes of image/svg/video content types
logger.info(f'Unhandled type: {content_type}')
            # for example, a zip with a macOS client update
logger.info(f'Unhandled type: {content_type} from {url}')
# telegram url can work with and without trailing slash (no redirect). P.S. not on every sub domain ;d
# telegram url can work with and without trailing slash (no redirect). P.S. not on every subdomain ;d
        # so there is a problem: it is random which variant of the link ends up being added
        # this if resolves the issue: if both links are available, we prefer the one without the trailing slash
without_trailing_slash = url[:-1:] if url.endswith('/') else url
@@ -277,7 +292,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
f'{without_trailing_slash}/' in LINKS_TO_TRACK:
LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')
except UnicodeDecodeError:
logger.warning('Codec can\'t decode byte. So its was a tgs file')
        logger.warning(f'Codec can\'t decode bytes. So it was a tgs file or a response with a broken content type: {url}')
except ClientConnectorError:
logger.warning(f'Wrong link: {url}')
except (ServerDisconnectedError, TimeoutError):
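
The trailing-slash comments in the last hunk describe a dedup rule: when both variants of a link end up tracked, the version without the slash wins. A minimal sketch of that rule, using a plain set in place of LINKS_TO_TRACK and made-up URLs (the helper name is not from the commit):

```python
def prefer_link_without_trailing_slash(url: str, tracked: set[str]) -> None:
    # if both 'x' and 'x/' are tracked, drop the trailing-slash variant
    without_trailing_slash = url[:-1] if url.endswith('/') else url
    if without_trailing_slash in tracked and f'{without_trailing_slash}/' in tracked:
        tracked.remove(f'{without_trailing_slash}/')


if __name__ == '__main__':
    links = {'telegram.org/faq', 'telegram.org/faq/'}
    prefer_link_without_trailing_slash('telegram.org/faq/', links)
    print(links)  # {'telegram.org/faq'}
```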