mirror of
https://github.com/MarshalX/telegram-crawler.git
synced 2024-11-27 01:42:18 +01:00
Add 304 as a success response;
add handling of XML; add the Sparkle file of the macOS client; fix request retrying on a 500 status code; fix encoding.
This commit is contained in:
parent
6f3d2bce6d
commit
344e550bd9
2 changed files with 9 additions and 6 deletions
|
@@ -43,7 +43,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
|||
if response.status == 500:
|
||||
return await asyncio.gather(crawl(url, session))
|
||||
|
||||
if response.status != 200:
|
||||
if response.status not in {200, 304}:
|
||||
if response.status != 302:
|
||||
content = await response.text()
|
||||
logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
|
||||
|
@@ -58,7 +58,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
|||
|
||||
os.makedirs(os.path.dirname(filename), exist_ok=True)
|
||||
async with aiofiles.open(filename, 'w') as f:
|
||||
content = await response.text()
|
||||
content = await response.text(encoding='UTF-8')
|
||||
content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
|
||||
content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
|
||||
content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
|
||||
|
|
|
@@ -24,6 +24,8 @@ HIDDEN_URLS = {
|
|||
|
||||
'desktop.telegram.org/changelog',
|
||||
|
||||
'osx.telegram.org/updates/versions.xml',
|
||||
|
||||
'instantview.telegram.org/rules',
|
||||
|
||||
'core.telegram.org/resources/cidr.txt',
|
||||
|
@@ -222,18 +224,19 @@ async def crawl(url: str, session: aiohttp.ClientSession):
|
|||
content_type = response.headers.get('content-type')
|
||||
|
||||
if response.status == 500:
|
||||
VISITED_LINKS.remove(url)
|
||||
return await asyncio.gather(crawl(url, session))
|
||||
|
||||
if response.status != 200:
|
||||
if response.status not in {200, 304}:
|
||||
if response.status != 302:
|
||||
content = await response.text()
|
||||
content = await response.text(encoding='UTF-8')
|
||||
logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
|
||||
return
|
||||
|
||||
if 'text/html' in content_type:
|
||||
if 'text' in content_type:
|
||||
LINKS_TO_TRACK.add(url)
|
||||
|
||||
html = await response.text()
|
||||
html = await response.text(encoding='UTF-8')
|
||||
absolute_links = cleanup_links(find_absolute_links(html))
|
||||
relative_links = cleanup_links(find_relative_links(html, url))
|
||||
|
||||
|
|
Loading…
Reference in a new issue