Guess the decompressed file size for downloaded Zip/Tar archives

For some uploads, the Itch API reports the expected decompressed file
size, but serves a compressed archive containing that content. Iterate
over Zip and Tar archives to compute the expected decompressed file size.
If anything is wrong with an archive, quietly ignore it.

Fixes #21
This commit is contained in:
Ryszard Knop 2024-11-08 21:54:55 +01:00
parent 00c3f79dbb
commit 06f75d4996

View File

@ -3,6 +3,8 @@ import json
import re import re
import logging import logging
import urllib.parse import urllib.parse
import zipfile
import tarfile
from typing import List, Dict, TypedDict, Optional, Union from typing import List, Dict, TypedDict, Optional, Union
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@ -219,6 +221,34 @@ class GameDownloader:
"""Performs a request to download a given upload by its ID.""" """Performs a request to download a given upload by its ID."""
return self.download_file(f"/uploads/{upload_id}/download", download_path, credentials) return self.download_file(f"/uploads/{upload_id}/download", download_path, credentials)
@staticmethod
def get_decompressed_content_size(target_path) -> Optional[int]:
    """Return the total unpacked size of the files inside a Zip/Tar archive.

    For some files, the Itch API returns the decompressed file size, but
    serves compressed downloads. Try to figure out the decompressed size.
    It may be a single file in the root, or a container + files in it.

    Args:
        target_path: Path of the downloaded file to inspect.

    Returns:
        The sum of the unpacked sizes of all regular files in the archive,
        or None if the path is not a readable Zip/Tar archive, the archive
        holds no regular files, or the archive is damaged/unreadable.
    """
    try:
        if zipfile.is_zipfile(target_path):
            with zipfile.ZipFile(target_path) as f:
                # Zip files contain either directories or files. The file format
                # is compression-aware, compress_size is packed, file_size is unpacked.
                sizes = [i.file_size for i in f.infolist() if not i.is_dir()]
                return sum(sizes) if sizes else None

        if tarfile.is_tarfile(target_path):
            with tarfile.open(target_path) as f:
                # Tar files can contain any Unix "file", so regular files,
                # directories, symlinks, devices and FIFOs are fair game...
                # On the other hand, TAR is not compression-aware.
                sizes = [i.size for i in f.getmembers() if i.isfile()]
                return sum(sizes) if sizes else None
    except (zipfile.BadZipFile, tarfile.TarError, OSError, EOFError):
        # Best effort only: a missing/unreadable path (OSError) or a truncated
        # compressed stream (EOFError) is treated like any other broken archive.
        return None

    return None
def download(self, url: str, skip_downloaded: bool = True): def download(self, url: str, skip_downloaded: bool = True):
match = re.match(ITCH_GAME_URL_REGEX, url) match = re.match(ITCH_GAME_URL_REGEX, url)
if not match: if not match:
@ -274,14 +304,14 @@ class GameDownloader:
upload_id = upload["id"] upload_id = upload["id"]
file_name = upload["filename"] file_name = upload["filename"]
file_size = upload.get("size") expected_size = upload.get("size")
upload_is_external = upload["storage"] == "external" upload_is_external = upload["storage"] == "external"
logging.debug( logging.debug(
"Downloading '%s' (%d), %s", "Downloading '%s' (%d), %s",
file_name, file_name,
upload_id, upload_id,
f"{file_size} bytes" if file_size is not None else "unknown size", f"{expected_size} bytes" if expected_size is not None else "unknown size",
) )
target_path = None if upload_is_external else os.path.join(paths["files"], file_name) target_path = None if upload_is_external else os.path.join(paths["files"], file_name)
@ -295,13 +325,24 @@ class GameDownloader:
if upload_is_external: if upload_is_external:
logging.debug("Found external download URL for %s: %s", target_url) logging.debug("Found external download URL for %s: %s", target_url)
external_urls.append(target_url) external_urls.append(target_url)
continue
try: try:
downloaded_file_size = os.stat(target_path).st_size downloaded_file_stat = os.stat(target_path)
if target_path is not None and file_size is not None and downloaded_file_size != file_size:
errors.append(f"File size is {downloaded_file_size}, expected {file_size} for upload {upload}")
except FileNotFoundError: except FileNotFoundError:
errors.append(f"Downloaded file not found for upload {upload}") errors.append(f"Downloaded file not found for upload {upload}")
continue
downloaded_size = downloaded_file_stat.st_size
content_size = self.get_decompressed_content_size(target_path)
print("expected", expected_size, "downloaded", downloaded_size, "content", content_size)
if (
all(x is not None for x in (target_path, expected_size, downloaded_size))
and downloaded_size != expected_size
and content_size != expected_size
):
errors.append(f"Downloaded file size is {downloaded_size} (content {content_size}), expected {expected_size} for upload {upload}")
logging.debug("Done downloading files for %s", title) logging.debug("Done downloading files for %s", title)
except Exception as e: except Exception as e: