add 304 as a success response;

add handling of XML;
add Sparkle file of the macOS client;
fix request retrying on 500 status code;
fix encoding.
Il'ya (Marshal) 2021-12-30 14:22:01 +01:00
parent 6f3d2bce6d
commit 344e550bd9
2 changed files with 9 additions and 6 deletions
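Read together, the hunks below change the status handling in crawl() roughly as in this sketch. It is a reconstruction, not the verbatim file: the visited-set guard at the top is an assumption implied by the retry fix (without removing the URL from VISITED_LINKS, the recursive retry would see it as already crawled and bail out), and the session setup and URL scheme are simplified.

import asyncio
import logging

import aiohttp

logger = logging.getLogger(__name__)
VISITED_LINKS: set = set()


async def crawl(url: str, session: aiohttp.ClientSession):
    # Assumed guard: a URL already in VISITED_LINKS is skipped. This is
    # why the 500 branch must remove the URL before retrying.
    if url in VISITED_LINKS:
        return
    VISITED_LINKS.add(url)

    async with session.get(f'https://{url}') as response:
        if response.status == 500:
            VISITED_LINKS.remove(url)  # the fix: forget the URL so the retry is not skipped
            return await asyncio.gather(crawl(url, session))

        if response.status not in {200, 304}:  # 304 Not Modified now counts as success
            if response.status != 302:
                content = await response.text(encoding='UTF-8')
                logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
            return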

View file

@@ -43,7 +43,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
         if response.status == 500:
             return await asyncio.gather(crawl(url, session))

-        if response.status != 200:
+        if response.status not in {200, 304}:
             if response.status != 302:
                 content = await response.text()
                 logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
@@ -58,7 +58,7 @@ async def crawl(url: str, session: aiohttp.ClientSession):
         os.makedirs(os.path.dirname(filename), exist_ok=True)
         async with aiofiles.open(filename, 'w') as f:
-            content = await response.text()
+            content = await response.text(encoding='UTF-8')
             content = re.sub(PAGE_GENERATION_TIME_REGEX, '', content)
             content = re.sub(PAGE_API_HASH_REGEX, PAGE_API_HASH_TEMPLATE, content)
             content = re.sub(PASSPORT_SSID_REGEX, PASSPORT_SSID_TEMPLATE, content)
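A note on the encoding change above: aiohttp's response.text() picks an encoding from the Content-Type charset and, when that is absent, falls back to detection, so the same page can decode differently between fetches; passing encoding='UTF-8' pins the decoding so the saved files stay byte-stable across runs. A minimal self-contained sketch (fetch_utf8 is a hypothetical helper, the URL is one tracked in the second file below):

import asyncio

import aiohttp


async def fetch_utf8(url: str) -> str:
    # Pin the decoding instead of letting aiohttp guess it from the
    # Content-Type charset (or detect it when the header is missing).
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text(encoding='UTF-8')


print(asyncio.run(fetch_utf8('https://core.telegram.org/resources/cidr.txt')))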

View file

@@ -24,6 +24,8 @@ HIDDEN_URLS = {
     'desktop.telegram.org/changelog',
+    'osx.telegram.org/updates/versions.xml',
     'instantview.telegram.org/rules',
+    'core.telegram.org/resources/cidr.txt',
@@ -222,18 +224,19 @@ async def crawl(url: str, session: aiohttp.ClientSession):
         content_type = response.headers.get('content-type')

         if response.status == 500:
+            VISITED_LINKS.remove(url)
             return await asyncio.gather(crawl(url, session))

-        if response.status != 200:
+        if response.status not in {200, 304}:
             if response.status != 302:
-                content = await response.text()
+                content = await response.text(encoding='UTF-8')
                 logger.debug(f'Skip {url} because status code == {response.status}. Content: {content}')
             return

-        if 'text/html' in content_type:
+        if 'text' in content_type:
             LINKS_TO_TRACK.add(url)
-            html = await response.text()
+            html = await response.text(encoding='UTF-8')
             absolute_links = cleanup_links(find_absolute_links(html))
             relative_links = cleanup_links(find_relative_links(html, url))
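The content-type change is what "add handling of XML" refers to: matching 'text' instead of 'text/html' lets the crawler also track and parse text/xml (presumably how the Sparkle appcast versions.xml added above is served) and text/plain (cidr.txt). A quick substring check illustrates which types each condition accepts; the values are illustrative, not an exhaustive list of what the crawler sees:

# Which content types pass the old and the new condition.
for content_type in ('text/html', 'text/xml', 'text/plain', 'application/json'):
    old = 'text/html' in content_type
    new = 'text' in content_type
    print(f'{content_type:20} old={old} new={new}')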