diff --git a/itch_dl/cli.py b/itch_dl/cli.py
index 8a1b1ef..5cdcb83 100644
--- a/itch_dl/cli.py
+++ b/itch_dl/cli.py
@@ -6,6 +6,7 @@ from .handlers import get_jobs_for_url_or_path
 from .downloader import drive_downloads
 from .keys import get_download_keys
 from .api import ItchApiClient
+
 logging.basicConfig()
 logging.getLogger().setLevel(logging.INFO)
diff --git a/itch_dl/consts.py b/itch_dl/consts.py
index 83d8ac8..fac9493 100644
--- a/itch_dl/consts.py
+++ b/itch_dl/consts.py
@@ -1,9 +1,10 @@
-from enum import Enum
-
 ITCH_BASE = "itch.io"
 ITCH_URL = f"https://{ITCH_BASE}"
 ITCH_API = f"https://api.{ITCH_BASE}"
 
+# Extracts https://user.itch.io/gamename to {'author': 'user', 'game': 'gamename'}
+ITCH_GAME_URL_REGEX = r"^https:\/\/(?P<author>[\w\d\-_]+)\.itch\.io\/(?P<game>[\w\d\-_]+)$"
+
 ITCH_BROWSER_TYPES = [
     "games",
     "tools",
@@ -15,15 +16,3 @@ ITCH_BROWSER_TYPES = [
     "game-mods",
     "misc",
 ]
-
-
-class ItchDownloadResult(Enum):
-    SUCCESS = 0
-    FAILURE = 1
-    MISSING_DOWNLOAD = 2
-    DOWNLOAD_TIMEOUT = 3
-
-
-# I mean, not really a const but eh
-class ItchDownloadError(Exception):
-    pass
diff --git a/itch_dl/downloader.py b/itch_dl/downloader.py
index 79e5c95..79f0a89 100644
--- a/itch_dl/downloader.py
+++ b/itch_dl/downloader.py
@@ -1,232 +1,44 @@
 import os
-import shutil
+import json
+import re
 import logging
-import traceback
-import subprocess
 from typing import Tuple, List, Dict, TypedDict, Optional
-from slugify import slugify
+from bs4 import BeautifulSoup
 from requests.exceptions import HTTPError
 from tqdm import tqdm
 from tqdm.contrib.concurrent import thread_map
 
 from .api import ItchApiClient
-from .consts import ItchDownloadError, ItchDownloadResult
+from .utils import ItchDownloadError, get_int_after_marker_in_json
+from .consts import ITCH_GAME_URL_REGEX
 
-# ------------------------------
-# --- OLD STUFF --- CUT HERE ---
-# ------------------------------
-
-
-WGET_PATH = shutil.which("wget")
-if WGET_PATH is None:
-    print(f"Warning: wget not available, site mirroring will not work!")
-
-
-def download_file(client: ItchApiClient, upload_id: int, download_path: str, creds: dict, print_url: bool=False):
-    # No timeouts, chunked uploads, default retry strategy, should be all good?
- try: - with client.get(f"/uploads/{upload_id}/download", data=creds, stream=True) as r: - r.raise_for_status() - if print_url: - print(f"Download URL: {r.url}") - - with open(download_path, 'wb') as f: - for chunk in r.iter_content(chunk_size=1048576): # 1MB chunks - f.write(chunk) - except HTTPError as e: - raise ItchDownloadError(f"Unrecoverable download error: {e}") - - -def get_meta_for_game_url(game_url: str) -> Tuple[int, str]: - """Finds the Game ID and Title for a Game URL.""" - data_url = game_url.rstrip("/") + "/data.json" - data_req = requests.get(data_url) - r.raise_for_status() - - data_json = data_req.json() - if not 'id' in data_json: - raise ItchDownloadError(f"Cannot fetch the Game ID for URL: {game_url}") - - return data_json['id'] - - - - - - - -def download_jam(jam_path: str, download_to: str, api_key: str, continue_from: str=None): - client = ItchApiClient(api_key) - jam_json = get_game_jam_json(jam_path) - - # Check API key validity: - profile_req = client.get("/profile") - if not profile_req.ok: - print(f"Provided API key appears to be invalid: {profile_req.text}") - exit(1) - - jobs = parse_jobs(jam_json) - jobs_successful = [] - jobs_failed = [] - - game_id_to_meta = {} # dict[game_id: int, (title: str, url: str)] - - for game_id, title, url in jobs: - game_id_to_meta[game_id] = (title, url) - - failed_game_ids = set() - - # No "continue from"? Yep, start right away. - should_process_jobs = continue_from is None - - for game_id, title, url in jobs: - label = f"{title} ({game_id})" - if not should_process_jobs: - if game_id == continue_from: - should_process_jobs = True - else: - continue - - try: - download_path = os.path.join(download_to, slugify(title)) - if PEDANTIC_MIRRORING: - site_mirror_path = os.path.join(download_to, "_sites") - else: - site_mirror_path = os.path.join(download_path, "site") - os.makedirs(download_path, exist_ok=True) - os.makedirs(site_mirror_path, exist_ok=True) - except: - raise ItchDownloadError(f"Could not create download directory: {download_path}") - - print(f"Trying to download {label} to {download_path}") - - if WGET_PATH is not None: - print("Downloading site...") - if PEDANTIC_MIRRORING: - extra_wget_args = [ - "--timestamping", - "--span-hosts", - "--convert-links", - "--adjust-extension", - "--page-requisites", - ] - else: - extra_wget_args = [] - - wget = subprocess.run([ - WGET_PATH, - *extra_wget_args, - "--quiet", - url - ], cwd=site_mirror_path) - - if wget.returncode != 0: - print(f"Warning: Site mirroring failed/incomplete.") - - creds = {} - if game_id in self.download_keys: - creds['download_key_id'] = self.download_keys[game_id] - print("Using {creds} for private uploads") - - game_uploads_req = client.get(f"/games/{game_id}/uploads", data=creds, timeout=15) - if not game_uploads_req.ok: - raise ItchDownloadError(f"Could not fetch game uploads for {label}: {game_uploads_req.text}") - - game_uploads = game_uploads_req.json()['uploads'] - print(f"Found {len(game_uploads)} upload(s)") - - try: - for upload in game_uploads: - upload_id = upload['id'] - file_name = upload['filename'] - file_size = upload['size'] - upload_is_external = upload['storage'] == 'external' - - print(f"Downloading '{file_name}' ({upload_id}), {file_size} bytes...") - if upload_is_external: - print("***********************************************************") - print("* *") - print("* WARNING: External storage - downloads will likely fail. *") - print("* Check the URL displayed below manually! 
*") - print("* *") - print("***********************************************************") - - target_path = os.path.join(download_path, file_name) - try: - download_file(client, upload_id, target_path, creds, print_url=upload_is_external) - except ItchDownloadError as e: - jobs_failed.append((game_id, file_name, str(e))) - print(f"Download failed for {file_name}: {e}") - continue - - try: - actual_file_size = os.stat(target_path).st_size - if actual_file_size == file_size: - jobs_successful.append((game_id, file_name)) - else: - jobs_failed.append((game_id, file_name, f"File size is {actual_file_size}, expected {file_size}")) - except FileNotFoundError: - jobs_failed.append((game_id, file_name, "Could not download file")) - - print(f"Done downloading {label}") - except ItchDownloadError as e: - failed_game_ids.append((game_id, str(e))) - print(message) - continue - except Exception as e: - print(f"Critical error while downloading {label}: {e}") - failed_game_ids.append((game_id, str(e))) - traceback.print_exc() - print(message) - continue - - successful_titles = {} - for game_id, file_name in jobs_successful: - if game_id not in successful_titles: - successful_titles[game_id] = [file_name] - - if any(successful_titles): - print(f"\nAll done, downloaded files for {len(successful_titles)} title(s):") - for game_id, files in successful_titles.items(): - print(f"{game_id_to_meta[game_id][0]}, {len(files)} file(s)") - - if any(jobs_failed): - print(f"\nDownloads failed for {len(jobs_failed)} file(s):") - for game_id, file_name, message in jobs_failed: - title, url = game_id_to_meta[game_id] - print(f"{title} - {file_name} - {message}") - print(f"Title URL: {url}") - - if any(failed_game_ids): - print(f"\nCompletely failed downloads for {len(failed_game_ids)} titles:") - for game_id, message in failed_game_ids: - title, url = game_id_to_meta[game_id] - print(f"{title} ({game_id}) - {url} - {message}") - - -# ------------------------------ -# --- OLD STUFF --- CUT HERE --- -# ------------------------------ - - -class GameAuthor(TypedDict, total=False): - name: str - url: str +TARGET_PATHS = { + 'site': 'site.html', + 'metadata': 'metadata.json', + 'files': 'files', + 'screenshots': 'screenshots' +} class GameMetadata(TypedDict, total=False): - description: str - - -class GameDownloadJob(TypedDict, total=False): - url: str game_id: int title: str - author: GameAuthor - metadata: GameMetadata + url: str + + errors: List[str] + external_downloads: List[str] + + author: str + author_url: str + + description: str + cover_url: str + + created_at: str + published_at: str class GameDownloader: @@ -236,16 +48,203 @@ class GameDownloader: self.client = ItchApiClient(api_key) - def download(self, url: str): - job = GameDownloadJob(url=url) - raise NotImplementedError("Not yet!") + def get_rating_json(self, site) -> Optional[dict]: + for ldjson_node in site.find_all("script", type="application/ld+json"): + try: + ldjson: dict = json.loads(ldjson_node.text.strip()) + if ldjson.get("@type") == "Product": + return ldjson + except json.JSONDecodeError: + continue # Can't do much with this... 
+
+        return None
+
+    def get_meta(self, site, **kwargs) -> Optional[str]:
+        """Grabs the content value of the first <meta> tag matching the given attributes."""
+        node = site.find("meta", attrs=kwargs)
+        if not node:
+            return None
+
+        return node.get("content")
+
+    def get_game_id(self, url: str, site: BeautifulSoup) -> int:
+        game_id: Optional[int] = None
+
+        try:
+            # Headers:
+            itch_path = self.get_meta(site, name="itch:path")
+            if itch_path is not None:
+                # Its value should be "games/12345", so:
+                game_id = int(itch_path.split("/")[-1])
+        except ValueError:
+            pass
+
+        if game_id is None:
+            # I.ViewGame has the "id" key in its config
+            for script in site.find_all("script", type="text/javascript"):
+                script_src = script.text.strip()
+                marker = "I.ViewGame"
+                if marker in script_src:
+                    game_id = get_int_after_marker_in_json(script_src, marker, "id")
+                    break
+
+        if game_id is None:
+            # We have to hit the server again :(
+            data_url = url.rstrip('/') + "/data.json"
+            data_request = self.client.get(data_url, append_api_key=False)
+            if data_request.ok:
+                try:
+                    game_id = int(data_request.json().get("id"))
+                except (ValueError, TypeError):  # Not a number, or no "id" key at all.
+                    pass
+
+        if game_id is None:
+            raise ItchDownloadError(f"Could not get the Game ID for URL: {url}")
+
+        return game_id
+
+    def extract_metadata(self, game_id: int, url: str, site: BeautifulSoup) -> GameMetadata:
+        description: Optional[str] = self.get_meta(site, property="og:description")
+        if not description:
+            description = self.get_meta(site, name="description")
+
+        metadata = GameMetadata(
+            game_id=game_id,
+            title=site.find("h1", class_="game_title").text.strip(),
+            url=url,
+            cover_url=self.get_meta(site, property="og:image"),
+            description=description
+        )
+
+        TODO_KEYS = ['author', 'author_url', 'created_at', 'published_at']
+        TODO_rating_json: Optional[dict] = self.get_rating_json(site)
+
+        return metadata
+
+    def get_credentials(self, title: str, game_id: int) -> dict:
+        credentials = {}
+        if game_id in self.download_keys:
+            credentials['download_key_id'] = self.download_keys[game_id]
+            logging.debug("Got credentials for %s: %s", title, str(credentials))
+
+        return credentials
+
+    def download_file(self, upload_id: int, download_path: Optional[str], creds: dict) -> str:
+        """Requests a download for the given upload ID, optionally saves the file to
+        the provided path, and returns the final URL that was downloaded."""
+        try:
+            # No timeouts, chunked uploads, default retry strategy, should be all good?
+            with self.client.get(f"/uploads/{upload_id}/download", data=creds, stream=True) as r:
+                r.raise_for_status()
+
+                if download_path is not None:  # ...and it will be None for external downloads.
+                    with tqdm.wrapattr(open(download_path, "wb"), "write",
+                                       miniters=1, desc=str(upload_id),
+                                       total=int(r.headers.get('content-length', 0))) as f:
+                        for chunk in r.iter_content(chunk_size=1048576):  # 1MB chunks
+                            f.write(chunk)
+
+                return r.url
+        except HTTPError as e:
+            raise ItchDownloadError(f"Unrecoverable download error: {e}")
+
+    def download(self, url: str, skip_downloaded: bool = True):
+        match = re.match(ITCH_GAME_URL_REGEX, url)
+        if not match:
+            raise ItchDownloadError(f"Game URL is invalid: {url} - please file a new issue.")
+
+        author, game = match['author'], match['game']
+
+        download_path = os.path.join(self.download_to, author, game)
+        os.makedirs(download_path, exist_ok=True)
+
+        paths: Dict[str, str] = {k: os.path.join(download_path, v) for k, v in TARGET_PATHS.items()}
+
+        if os.path.exists(paths['metadata']) and skip_downloaded:
+            # As metadata is the final file we write, all the files
+            # should already be downloaded at this point.
+            logging.info("Skipping already-downloaded game for URL: %s", url)
+            return
+
+        logging.info("Downloading %s", url)
+        r = self.client.get(url, append_api_key=False)
+        if not r.ok:
+            raise ItchDownloadError(f"Could not download the game site for {url}")
+
+        site = BeautifulSoup(r.text, features="lxml")
+        game_id = self.get_game_id(url, site)
+
+        metadata = self.extract_metadata(game_id, url, site)
+        title = metadata['title'] or game
+
+        credentials = self.get_credentials(title, game_id)
+        game_uploads_req = self.client.get(f"/games/{game_id}/uploads", data=credentials, timeout=15)
+        if not game_uploads_req.ok:
+            raise ItchDownloadError(f"Could not fetch game uploads for {title}: {game_uploads_req.text}")
+
+        game_uploads = game_uploads_req.json()['uploads']
+        print(f"Found {len(game_uploads)} upload(s)")
+        logging.debug(str(game_uploads))
+
+        external_urls = []
+        errors = []
+
+        try:
+            os.makedirs(paths['files'], exist_ok=True)
+            for upload in tqdm(game_uploads, desc=title):
+                upload_id = upload['id']
+                file_name = upload['filename']
+                file_size = upload['size']
+                upload_is_external = upload['storage'] == 'external'
+
+                print(f"Downloading '{file_name}' ({upload_id}), {file_size} bytes...")
+                target_path = None if upload_is_external else os.path.join(paths['files'], file_name)
+
+                try:
+                    target_url = self.download_file(upload_id, target_path, credentials)
+                except ItchDownloadError as e:
+                    errors.append(f"Download failed for upload {upload}: {e}")
+                    continue
+
+                if upload_is_external:
+                    logging.info("Found external download URL for %s: %s", file_name, target_url)
+                    external_urls.append(target_url)
+                    continue  # No file to verify on disk for external downloads.
+
+                try:
+                    actual_file_size = os.stat(target_path).st_size
+                    if actual_file_size != file_size:
+                        errors.append(f"File size is {actual_file_size}, but expected {file_size} for upload {upload}")
+                except FileNotFoundError:
+                    errors.append(f"Downloaded file not found for upload {upload}")
+
+            logging.info("Done downloading files for %s", title)
+        except Exception as e:
+            error = f"Download failed for {title}: {e}"
+            logging.exception(error)
+            errors.append(error)
+
+        metadata['errors'] = errors
+        metadata['external_downloads'] = external_urls
+
+        if len(external_urls) > 0:
+            print(f"WARNING: Game {title} has external download URLs: {external_urls}")
+
+        # TODO: Screenshots and site assets
+        with open(paths['site'], 'w') as f:
+            f.write(site.prettify())
+
+        with open(paths['metadata'], 'w') as f:
+            json.dump(metadata, f)
+
+        logging.info("Finished job %s (%s)", url, title)
 
 
 def drive_downloads(jobs: List[str], download_to: str, api_key: str, keys: Dict[int, str],
parallel: int = 1): downloader = GameDownloader(download_to, api_key, keys) if parallel > 1: - thread_map(downloader.download, jobs, max_workers=parallel, ) + results = thread_map(downloader.download, jobs, desc="Games", max_workers=parallel) else: - for job in tqdm(jobs): - downloader.download(job) + results = [downloader.download(job) for job in tqdm(jobs, desc="Games")] + + print(results) diff --git a/itch_dl/handlers.py b/itch_dl/handlers.py index 184db57..f1d57d2 100644 --- a/itch_dl/handlers.py +++ b/itch_dl/handlers.py @@ -1,4 +1,3 @@ -import re import json import os.path import logging @@ -8,7 +7,8 @@ from typing import List, Optional from bs4 import BeautifulSoup from .api import ItchApiClient -from .consts import ITCH_BASE, ITCH_URL, ITCH_BROWSER_TYPES, ItchDownloadError +from .utils import ItchDownloadError, get_int_after_marker_in_json +from .consts import ITCH_BASE, ITCH_URL, ITCH_BROWSER_TYPES def get_jobs_for_game_jam_json(game_jam_json: dict) -> List[str]: @@ -18,34 +18,6 @@ def get_jobs_for_game_jam_json(game_jam_json: dict) -> List[str]: return [g['game']['url'] for g in game_jam_json['jam_games']] -def get_int_after_marker_in_json(text: str, marker: str, key: str) -> Optional[int]: - """ - Many itch.io sites use a pattern like this: Most of the HTML page - is prerendered, but certain interactive objects are handled with - JavaScript initialized with `I.WidgetHandler({"id": 123, ...})` - somewhere near the end of each page. Those config blocks often - contain metadata like game/page IDs that we want to extract. - """ - marker_line: Optional[str] = None - for line in reversed(text.splitlines()): - marker_index = line.find(marker) - if marker_index != -1: - marker_line = line[marker_index:] - break - - if marker_line is None: - return None - - # Notice double-slashes in the f-string (not r-string)! - pattern = f'\\"{key}\\":\\s?(\\d+)' - - found_ints = re.findall(pattern, marker_line) - if len(found_ints) != 1: - return None - - return int(found_ints[0]) - - def get_game_jam_json(jam_url: str, client: ItchApiClient) -> dict: r = client.get(jam_url) if not r.ok: diff --git a/itch_dl/utils.py b/itch_dl/utils.py new file mode 100644 index 0000000..9a002ce --- /dev/null +++ b/itch_dl/utils.py @@ -0,0 +1,34 @@ +import re +from typing import Optional + + +class ItchDownloadError(Exception): + pass + + +def get_int_after_marker_in_json(text: str, marker: str, key: str) -> Optional[int]: + """ + Many itch.io sites use a pattern like this: Most of the HTML page + is prerendered, but certain interactive objects are handled with + JavaScript initialized with `I.WidgetHandler({"id": 123, ...})` + somewhere near the end of each page. Those config blocks often + contain metadata like game/page IDs that we want to extract. + """ + marker_line: Optional[str] = None + for line in reversed(text.splitlines()): + marker_index = line.find(marker) + if marker_index != -1: + marker_line = line[marker_index:] + break + + if marker_line is None: + return None + + # Notice double-slashes in the f-string (not r-string)! + pattern = f'\\"{key}\\":\\s?(\\d+)' + + found_ints = re.findall(pattern, marker_line) + if len(found_ints) != 1: + return None + + return int(found_ints[0])
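
For reference, a minimal sketch of how the new ITCH_GAME_URL_REGEX is meant to be used; the URL and names below are made-up examples:

    import re

    from itch_dl.consts import ITCH_GAME_URL_REGEX

    match = re.match(ITCH_GAME_URL_REGEX, "https://example-author.itch.io/example-game")
    assert match is not None
    print(match["author"], match["game"])  # -> example-author example-game

GameDownloader.download() relies on exactly this: the two named groups determine the author/game directory pair created under download_to.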
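Similarly, a quick sketch of the page pattern get_int_after_marker_in_json() targets; the script line here is a contrived stand-in for real itch.io page output:

    from itch_dl.utils import get_int_after_marker_in_json

    script_src = 'I.ViewGame(R.Game, {"id": 12345, "mobile_friendly": true});'
    print(get_int_after_marker_in_json(script_src, "I.ViewGame", "id"))  # -> 12345

This is the second lookup strategy in GameDownloader.get_game_id(), tried between the itch:path meta header and the data.json fallback request.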
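Given the TARGET_PATHS mapping and the paths built in download(), each game should end up on disk roughly like this (placeholder names in angle brackets):

    <download_to>/
        <author>/
            <game>/
                site.html        (prettified page HTML)
                metadata.json    (GameMetadata; written last, so it doubles as the "already downloaded" marker)
                files/           (one file per non-external upload)
                screenshots/     (reserved - see the TODO, not populated yet)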