From b7234090fb97632c8f82ac55bd1bf9834fab3b5b Mon Sep 17 00:00:00 2001 From: "Il'ya (Marshal)" Date: Wed, 29 Jun 2022 11:04:34 +0200 Subject: [PATCH] fix list comparing; add logs with set operations; remove version from GIF signature; change order of adding operations. --- make_tracked_links_list.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/make_tracked_links_list.py b/make_tracked_links_list.py index 1cc9ed0b73..6a51b41426 100644 --- a/make_tracked_links_list.py +++ b/make_tracked_links_list.py @@ -364,14 +364,17 @@ async def _crawl(url: str, session: aiohttp.ClientSession): return if is_textable_content_type(content_type): - if is_translation_url(url): - LINKS_TO_TRANSLATIONS.add(url) - else: - LINKS_TO_TRACK.add(url) - # raw content will be cached by aiohttp. Don't worry about it raw_content = await response.read() content = await response.text(encoding='UTF-8') + + if is_translation_url(url): + LINKS_TO_TRANSLATIONS.add(url) + logger.info(f'add {url} to LINKS_TO_TRANSLATIONS') + else: + LINKS_TO_TRACK.add(url) + logger.info(f'add {url} to LINKS_TO_TRACK') + absolute_links = cleanup_links(find_absolute_links(content)) relative_links_finder = find_relative_links @@ -384,6 +387,7 @@ async def _crawl(url: str, session: aiohttp.ClientSession): await asyncio.gather(*[crawl(url, session) for url in sub_links]) elif is_trackable_content_type(content_type): LINKS_TO_TRACKABLE_RESOURCES.add(url) + logger.info(f'add {url} to LINKS_TO_TRACKABLE_RESOURCES') else: # for example, zip with update of macOS client logger.info(f'Unhandled type: {content_type} from {url}') @@ -395,12 +399,13 @@ async def _crawl(url: str, session: aiohttp.ClientSession): without_trailing_slash = url[:-1:] if url.endswith('/') else url if without_trailing_slash in links_set and f'{without_trailing_slash}/' in links_set: links_set.remove(f'{without_trailing_slash}/') + logger.info(f'remove {without_trailing_slash}/') except UnicodeDecodeError: logger.warning(f'Codec can\'t decode bytes. So it was a tgs file or response with broken content type {url}') - if raw_content.startswith(b'GIF89a'): - LINKS_TO_TRACK.remove(url) + if raw_content.startswith(b'GIF'): LINKS_TO_TRACKABLE_RESOURCES.add(url) + logger.info(f'add {url} to LINKS_TO_TRACKABLE_RESOURCES (raw content)') async def start(url_list: set[str]): @@ -419,7 +424,7 @@ if __name__ == '__main__': try: OLD_URL_LIST = set() - for filename in (OUTPUT_FILENAME, OUTPUT_RESOURCES_FILENAME): + for filename in (OUTPUT_FILENAME, OUTPUT_RESOURCES_FILENAME, OUTPUT_TRANSLATIONS_FILENAME): with open(filename, 'r') as f: OLD_URL_LIST |= set([l.replace('\n', '') for l in f.readlines()])