mirror of https://github.com/MarshalX/telegram-crawler.git
synced 2024-11-21 23:06:40 +01:00

add tracking of all media

commit 961320022c (parent e217c3f5ec)
3 changed files with 71 additions and 20 deletions
@@ -79,10 +79,6 @@ resources (-s flag of apktool to disable disassembly of dex files).
 Writing a check for the need for decompilation by the hash of the apk file
 would take more time.
 
-### TODO list
-
-- add storing hashes of image, svg, video.
-
 ### Example of link crawler rules configuration
 
 ```python
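Aside: the TODO list above is removed because this commit implements it; the hunks below start storing hashes of media responses. For the apk-hash check the README mentions as future work, a minimal sketch of such a gate could look like this (file paths and the helper name are hypothetical, not from the repo):

```python
# Sketch only: skip an expensive apktool run when the apk is unchanged.
import hashlib
from pathlib import Path

APK_PATH = Path('telegram.apk')          # hypothetical apk location
HASH_PATH = Path('telegram.apk.sha256')  # hypothetical digest cache

def apk_changed() -> bool:
    current = hashlib.sha256(APK_PATH.read_bytes()).hexdigest()
    previous = HASH_PATH.read_text().strip() if HASH_PATH.exists() else ''
    HASH_PATH.write_text(current)
    return current != previous  # decompile only when the digest moved
```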
@@ -47,6 +47,10 @@ logging.basicConfig(format='%(message)s', level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
 
+def get_hash(data: bytes) -> str:
+    return hashlib.sha256(data).hexdigest()
+
+
 async def download_file(url, path, session):
     async with session.get(url) as response:
         if response.status != 200:
@@ -96,7 +100,7 @@ async def track_additional_files(
         content = await r_file.read()
 
         if save_hash_only:
-            await w_file.write(hashlib.sha256(content).hexdigest())
+            await w_file.write(get_hash(content))
             continue
 
         content = re.sub(r'id=".*"', 'id="tgcrawl"', content)
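With the new helper in place, the save_hash_only branch writes a 64-character digest instead of the file body, which keeps binary media diffable as one-line text. A standalone sketch of that flow, using synchronous file I/O instead of the repo's aiofiles handles:

```python
import hashlib

def get_hash(data: bytes) -> str:
    # same body as the helper added in the diff
    return hashlib.sha256(data).hexdigest()

def store(content: bytes, out_path: str, save_hash_only: bool) -> None:
    # Sketch: binary media is reduced to its digest; text assets are kept whole.
    with open(out_path, 'w', encoding='utf-8') as w_file:
        if save_hash_only:
            w_file.write(get_hash(content))  # 64 hex chars, diff-friendly
        else:
            w_file.write(content.decode('utf-8'))
```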
@@ -194,6 +198,23 @@ async def collect_translations_paginated_content(url: str, session: aiohttp.ClientSession
     return '\n'.join(content)
 
 
+def is_hashable_only_content_type(content_type) -> bool:
+    hashable_only_content_types = (
+        'png',
+        'jpeg',
+        'x-icon',
+        'gif',
+        'mp4',
+        'webm',
+    )
+
+    for hashable_only_content_type in hashable_only_content_types:
+        if hashable_only_content_type in content_type:
+            return True
+
+    return False
+
+
 async def crawl(url: str, session: aiohttp.ClientSession):
     try:
         logger.info(f'Process {url}')
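The new predicate matches by substring, so full MIME types such as image/png or video/mp4 hit the short tokens. A condensed, standalone equivalent for illustration:

```python
def is_hashable_only_content_type(content_type) -> bool:
    # condensed rewrite of the function added in the hunk above
    hashable_only_content_types = ('png', 'jpeg', 'x-icon', 'gif', 'mp4', 'webm')
    return any(token in content_type for token in hashable_only_content_types)

assert is_hashable_only_content_type('image/png')
assert is_hashable_only_content_type('video/mp4; charset=binary')
assert not is_hashable_only_content_type('text/html')
```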
@@ -210,16 +231,35 @@ async def crawl(url: str, session: aiohttp.ClientSession):
 
             # bypass external slashes and so on
             url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
 
+            is_hashable_only = is_hashable_only_content_type(response.content_type)
+            # amazing dirt for media files like
+            # telegram.org/file/811140591/1/q7zZHjgES6s/9d121a89ffb0015837
+            # with response content type HTML instead of image. Shame on you
+            # sometimes it returns correct type. noice load balancing
+            is_sucking_file = '/file/' in url and 'text' in response.content_type
+
             # handle pure domains and html pages without ext in url
             ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
 
+            # I don't add ext by content type for images and so on cuz TG servers sucks.
+            # Some servers do not return correct content type. Some servers do...
+            if is_hashable_only or is_sucking_file:
+                ext = ''
+
             filename = OUTPUT_FOLDER + '/'.join(url_parts) + ext
+            os.makedirs(os.path.dirname(filename), exist_ok=True)
+
+            if is_sucking_file or is_hashable_only:
+                content = await response.read()
+                async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
+                    await f.write(get_hash(content))
+                return
+
             content = await response.text(encoding='UTF-8')
             if re.search(TRANSLATIONS_EN_CATEGORY_URL_REGEX, url):
                 content = await collect_translations_paginated_content(url, session)
 
-            os.makedirs(os.path.dirname(filename), exist_ok=True)
             async with aiofiles.open(filename, 'w', encoding='utf-8') as f:
                 content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
                 content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
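The path mapping deserves a note: bare domains and extension-less pages get .html appended, while media URLs keep no extension since only a digest is written. A sketch of that logic with OUTPUT_FOLDER and ILLEGAL_PATH_CHARS stubbed in (the real values live elsewhere in the script; url_to_filename is a hypothetical wrapper):

```python
# Stand-in values for the example only.
ILLEGAL_PATH_CHARS = ''  # every string contains '', so empty parts drop out
OUTPUT_FOLDER = 'data/'

def url_to_filename(url: str, hash_only: bool) -> str:
    url_parts = [p for p in url.split('/') if p not in ILLEGAL_PATH_CHARS]
    ext = '.html' if '.' not in url_parts[-1] or len(url_parts) == 1 else ''
    if hash_only:
        ext = ''  # a digest is stored, so no extension is implied
    return OUTPUT_FOLDER + '/'.join(url_parts) + ext

print(url_to_filename('telegram.org', False))                # data/telegram.org.html
print(url_to_filename('telegram.org/js/widget.js', False))   # data/telegram.org/js/widget.js
print(url_to_filename('telegram.org/file/811140591', True))  # data/telegram.org/file/811140591
```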
@@ -12,7 +12,7 @@ from aiohttp import ClientConnectorError, ServerDisconnectedError
 
 PROTOCOL = 'https://'
 BASE_URL = 'telegram.org'
-# its necessary to help crawler to find more links
+# it's necessary to help crawler to find more links
 HIDDEN_URLS = {
     'corefork.telegram.org',
     'corefork.telegram.org/getProxyConfig',
@@ -111,8 +111,7 @@ CRAWL_RULES = {
     },
     'telegram.org': {
         'deny': {
-            'file/',
-            r'apps$'
+            r'apps$',
         },
     },
     'webz.telegram.org': {
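Dropping 'file/' from the deny set is the list-side half of this commit: telegram.org/file/... URLs may now be collected and tracked. The deny entries are applied as regular expressions (hence the r'apps$' anchor); a sketch of how such a set can be evaluated, with the matching helper assumed rather than copied from the repo:

```python
import re

# Assumed shape: one domain's deny rules as regex patterns, as in the hunk above.
DENY = {r'apps$'}

def is_denied(path: str) -> bool:
    # re.search treats a plain string like 'file/' and an anchored
    # pattern like r'apps$' uniformly
    return any(re.search(rule, path) for rule in DENY)

assert is_denied('apps')                # anchored match at the end of the path
assert not is_denied('apps/android')    # the $ anchor stops the match here
assert not is_denied('file/811140591')  # no longer denied after this commit
```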
@@ -227,6 +226,28 @@ def cleanup_links(links: set[str]) -> set[str]:
     return cleaned_links
 
 
+def is_trackable_content_type(content_type) -> bool:
+    trackable_content_types = (
+        'javascript',
+        'css',
+        'plain',
+        'json',
+        'svg',
+        'png',
+        'jpeg',
+        'x-icon',
+        'gif',
+        'mp4',
+        'webm',
+    )
+
+    for trackable_content_type in trackable_content_types:
+        if trackable_content_type in content_type:
+            return True
+
+    return False
+
+
 async def crawl(url: str, session: aiohttp.ClientSession):
     if url in VISITED_LINKS:
         return
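Note that this tuple is a strict superset of is_hashable_only_content_type above: the extra entries are the text-like types that can be diffed verbatim, while the shared media types get reduced to hashes. A quick check:

```python
# The two tuples from this commit, side by side; the difference is exactly
# the set of text-like types that are diffed rather than hashed.
trackable = {'javascript', 'css', 'plain', 'json', 'svg',
             'png', 'jpeg', 'x-icon', 'gif', 'mp4', 'webm'}
hashable_only = {'png', 'jpeg', 'x-icon', 'gif', 'mp4', 'webm'}

assert hashable_only < trackable          # strict subset
print(sorted(trackable - hashable_only))  # the text-diffable types
```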
@@ -257,19 +278,13 @@ async def crawl(url: str, session: aiohttp.ClientSession):
 
                 sub_links = absolute_links | relative_links
                 await asyncio.gather(*[crawl(url, session) for url in sub_links])
-            elif 'application/javascript' in content_type:
-                LINKS_TO_TRACK.add(url)
-            elif 'css' in content_type:
-                LINKS_TO_TRACK.add(url)
-            elif 'plain' in content_type:
-                LINKS_TO_TRACK.add(url)
-            elif 'application/json' in content_type:
+            elif is_trackable_content_type(content_type):
                 LINKS_TO_TRACK.add(url)
             else:
-                # TODO track hashes of image/svg/video content types
-                logger.info(f'Unhandled type: {content_type}')
+                # for example, zip with update of macOS client
+                logger.info(f'Unhandled type: {content_type} from {url}')
 
-            # telegram url can work with and without trailing slash (no redirect). P.S. not on every sub domain ;d
+            # telegram url can work with and without trailing slash (no redirect). P.S. not on every subdomain ;d
             # so this is a problem when we have random behavior with link will be added
             # this if resolve this issue. If available both link we prefer without trailing slash
             without_trailing_slash = url[:-1:] if url.endswith('/') else url
@@ -277,7 +292,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
                     f'{without_trailing_slash}/' in LINKS_TO_TRACK:
                 LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')
     except UnicodeDecodeError:
-        logger.warning('Codec can\'t decode byte. So its was a tgs file')
+        logger.warning(f'Codec can\'t decode bytes. So it was a tgs file or response with broken content type {url}')
     except ClientConnectorError:
         logger.warning(f'Wrong link: {url}')
     except (ServerDisconnectedError, TimeoutError):
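Besides the broader except message, this block dedupes trailing-slash twins: when both url and url/ were collected, the slashed variant is dropped. A sketch of that normalization over a plain set, with names mirroring the diff:

```python
# Sketch: prefer the slash-less variant when both forms of a URL were collected.
LINKS_TO_TRACK = {'telegram.org/apps', 'telegram.org/apps/', 'telegram.org/faq'}

def prefer_without_trailing_slash(url: str) -> None:
    without_trailing_slash = url[:-1] if url.endswith('/') else url
    if without_trailing_slash in LINKS_TO_TRACK and \
            f'{without_trailing_slash}/' in LINKS_TO_TRACK:
        LINKS_TO_TRACK.remove(f'{without_trailing_slash}/')

for u in list(LINKS_TO_TRACK):  # copy: we mutate the set while iterating
    prefer_without_trailing_slash(u)

assert LINKS_TO_TRACK == {'telegram.org/apps', 'telegram.org/faq'}
```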