Mirror of https://github.com/DragoonAethis/itch-dl.git, synced 2025-01-22 01:41:11 +01:00
Trial The Second: Make leafo happy by not murdering his servers
Drop Selenium and clicking around itch - use the API instead. Pulls HTML5 games, is way faster, and uses fewer resources on both ends.
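In practice, the switch means every download is just an authenticated call against api.itch.io. Here's a minimal sketch of the pattern this commit adopts - the game ID and key below are placeholders, and `api_key` rides along in the request payload the same way the client in this diff sends it:

```python
import requests

ITCH_API = "https://api.itch.io"
API_KEY = "<KEY>"  # generate one at https://itch.io/user/settings/api-keys
GAME_ID = 123456   # placeholder game ID

# List the game's uploads, then stream the first one to disk in 1MB chunks.
creds = {"api_key": API_KEY}
uploads = requests.get(f"{ITCH_API}/games/{GAME_ID}/uploads", data=creds).json()["uploads"]

with requests.get(f"{ITCH_API}/uploads/{uploads[0]['id']}/download", data=creds, stream=True) as r:
    r.raise_for_status()
    with open(uploads[0]["filename"], "wb") as f:
        for chunk in r.iter_content(chunk_size=1048576):
            f.write(chunk)
```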
This commit is contained in:
parent 5d0b8e1e99
commit f86044050f

README.md (13 lines changed)
@@ -6,20 +6,21 @@ What you'll need:
 - Python 3.8+
 - `pip install -r requirements.txt`
-- [chromedriver](https://chromedriver.chromium.org/downloads) somewhere in your PATH
+- For site mirroring, [wget](https://www.gnu.org/software/wget/) in your PATH.
 
-On Arch, `pacman -S python chromium python-selenium python-requests python-slugify` works.
+On Arch, `pacman -S wget python python-requests python-slugify` works.
 
 How to use this:
 
 - Go to your jam's page, ex. https://itch.io/jam/gbcompo21 and right-click -> View Source.
 - Ctrl-F for `"id":` - it should find that text once, followed by a number. Write it down.
 - (If you found it multiple times, grab the one after ViewJam something something.)
 - Download https://itch.io/jam/NUMBER/entries.json (replacing NUMBER with what you wrote down)
-- Run the downloader: `python downloader.py entries.json`
+- Generate a new API key on your user account page: https://itch.io/user/settings/api-keys
+- Run the downloader: `python downloader.py --api-key <KEY> entries.json`
 - Wait. This is going to take a while.
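The manual steps above can be scripted. A rough sketch - the regex just mirrors the Ctrl-F step, so the same "multiple matches" caveat applies:

```python
import json
import re

import requests

jam_url = "https://itch.io/jam/gbcompo21"  # your jam's page
page = requests.get(jam_url).text

# NB: as noted above, "id": can match more than once - if so, take the
# one after the ViewJam block rather than blindly using the first hit.
jam_id = re.search(r'"id":\s*(\d+)', page).group(1)

entries = requests.get(f"https://itch.io/jam/{jam_id}/entries.json").json()
with open("entries.json", "w") as f:
    json.dump(entries, f)
```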
 
-**This downloader does not (and probably will not) support HTML5-only games.** (For some of
-these, you might get lucky by hitting F12 while the game loads and grabbing what's in there.)
+The downloader is able to grab more or less everything you can download via the itch app.
 
 It's expected that the downloader output will not be complete - logs are stupidly verbose and
 it prints a report on successful/failed downloads, so you must manually grab whatever was not
@@ -28,3 +29,5 @@ handled for you automatically for some reason.
 
 The downloader also grabs the entry page HTML, which usually comes with controls and such. It
 does not download images, external assets and so on, just the text - if the Itch page dies,
 so will most elements on those downloaded pages. Controls should survive, though.
+
+(There's a pedantic mirroring toggle in the script, if you know what you're doing though.)
downloader.py (405 lines changed, Normal file → Executable file)
@@ -4,21 +4,28 @@ import os
 import sys
 import json
 import time
+import shutil
 import hashlib
 import argparse
 import traceback
+import subprocess
 from enum import Enum
 from multiprocessing import Pool
 
 import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
 
 from slugify import slugify
 
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import NoSuchElementException, TimeoutException
+WGET_PATH = shutil.which("wget")
+if WGET_PATH is None:
+    print(f"Warning: wget not available, site mirroring will not work!")
+
+# Try to download all site assets, images etc included.
+# You probably don't want this, but here you go!
+PEDANTIC_MIRRORING = False
+
+ITCH_API = "https://api.itch.io"
 
 
 class ItchDownloadResult(Enum):
@@ -28,206 +35,263 @@ class ItchDownloadResult(Enum):
     DOWNLOAD_TIMEOUT = 3
 
 
 class ItchDownloadError(Exception):
     pass
 
 
+class ItchApiClient():
+    def __init__(self, base_url: str, api_key: str):
+        self.base_url = base_url
+        self.api_key = api_key
+
+        self.requests = requests.Session()
+
+        retry_strategy = Retry(
+            total=5,
+            backoff_factor=10,
+            allowed_methods=["HEAD", "GET"],
+            status_forcelist=[429, 500, 502, 503, 504]
+        )
+
+        # No timeouts - set them explicitly on API calls below!
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        self.requests.mount("https://", adapter)
+        self.requests.mount("http://", adapter)
+
+    def add_api_key(self, kwargs):
+        # Adds the API key to request params, if one was not
+        # already provided outside of the client.
+        if 'data' in kwargs:
+            params = kwargs['data']
+        else:
+            params = {}
+            kwargs['data'] = params
+
+        if 'api_key' not in params:
+            params['api_key'] = self.api_key
+
+    def get(self, endpoint: str, *args, **kwargs):
+        self.add_api_key(kwargs)
+        return self.requests.get(self.base_url + endpoint, *args, **kwargs)
 
 
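Usage boils down to constructing the client once and calling `get()` with per-call timeouts; the `/profile` request below is the same key-validity check `download_jam()` performs later in this diff. With `backoff_factor=10` and 5 total retries, urllib3 spaces repeated attempts out by tens of seconds, which is the whole "stop murdering the servers" point:

```python
client = ItchApiClient("https://api.itch.io", "<KEY>")  # key is a placeholder

profile_req = client.get("/profile", timeout=15)
print(profile_req.ok)  # False here means the API key is bad
```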
+def download_file(client: ItchApiClient, upload_id: int, download_path: str, print_url: bool=False):
+    # No timeouts, chunked uploads, default retry strategy, should be all good?
+    try:
+        with client.get(f"/uploads/{upload_id}/download", stream=True) as r:
+            r.raise_for_status()
+            if print_url:
+                print(f"Download URL: {r.url}")
+
+            with open(download_path, 'wb') as f:
+                for chunk in r.iter_content(chunk_size=1048576):  # 1MB chunks
+                    f.write(chunk)
+    except requests.exceptions.HTTPError as e:
+        raise ItchDownloadError(f"Unrecoverable download error: {e}")
 
 
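Paired with the uploads listing, the whole fetch-one-game flow becomes roughly this (placeholder game ID, `client` from the sketch above):

```python
uploads = client.get("/games/123456/uploads", timeout=15).json()["uploads"]
for upload in uploads:
    download_file(client, upload["id"], upload["filename"])
```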
+def get_download_keys(client: ItchApiClient):
+    print("Fetching all download keys...")
+    download_keys = {}
+    page = 1
+
+    while True:
+        print(f"Downloading page {page}...")
+        try:
+            r = client.get("/profile/owned-keys", data={"page": page}, timeout=15)
+            r.raise_for_status()
+        except Exception as e:
+            print(f"Got error while fetching download keys: {e}")
+            print(f"Let's just pretend this is enough and move on...")
+            break
+
+        data = r.json()
+        if 'owned_keys' not in data:
+            break  # Assuming we're out of keys already...
+
+        for key in data['owned_keys']:
+            download_keys[key['game_id']] = key['id']
+
+        if len(data['owned_keys']) == data['per_page']:
+            page += 1
+        else:
+            break
+
+    print(f"Fetched {len(download_keys)} download keys.")
+    return download_keys
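The stop condition is the short page: as long as a page holds exactly `per_page` keys, another page gets requested. Illustratively, each page appears to be shaped like this - field names are the ones the loop reads, the values are made up:

```python
# Illustrative /profile/owned-keys page (not a real response):
{
    "per_page": 50,
    "owned_keys": [
        {"id": 111111, "game_id": 222222},  # other fields are ignored here
        # ... up to per_page entries; fewer means this is the last page
    ],
}
```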
 
 
 def parse_jobs(jam_json: dict) -> list[tuple[int, str, str]]:
     if 'jam_games' not in jam_json:
         raise Exception("Provided JSON is not a valid itch.io jam JSON.")
 
     # Extract (id, url) pairs from all the entries.
-    return [(e['game']['id'], e['game']['title'], e['game']['url']) for e in jam_json['jam_games']]
+    return [(int(e['game']['id']), e['game']['title'], e['game']['url']) for e in jam_json['jam_games']]
 
 
-def try_extract_download_links(driver: webdriver.Chrome, title_url: str) -> list[str]:
-    elems = driver.find_elements(By.CLASS_NAME, "download_btn")
-    if len(elems) == 0:
-        raise NoSuchElementException("No download links found.")
-
-    cookie = driver.get_cookie("itchio_token")['value']
-    file_ids = [elem.get_attribute("data-upload_id") for elem in elems]
-    file_urls = []
-
-    for file_id in file_ids:
-        meta_url = f"{title_url}/file/{file_id}"
-        r = requests.post(meta_url, data={"csrf_token": cookie})
-        if r.ok:
-            file_urls.append(r.json()['url'])
-        else:
-            print(f"Error downloading metadata for file {file_id} (status {r.status_code}): {r.text}")
-
-    print(f"Extracted URLs: {file_urls}")
-    return file_urls
-
-
-def download_link(link: str, path: str) -> tuple[bool, str]:
-    r = requests.get(link)
-    if not r.ok:
-        return (False, r.reason)
-
-    # The bytes we need:
-    content = r.content
-
-    # Figure out the filename:
-    if 'Content-Disposition' in r.headers:
-        name = r.headers['Content-Disposition']
-        name = name.removeprefix('attachment; filename="').removesuffix('"')
-    else:  # uhhhh random bullshit go, good luck?
-        md5 = hashlib.md5()
-        md5.update(content)
-        name = md5.hexdigest()
-
-    # Make sure we don't overwrite files with the same name.
-    fcounter = 1
-    filename = f"{path}/{name}"
-    while os.path.exists(filename):
-        fcounter += 1
-        filename = f"{path}/{name}.{fcounter}"
-
-    try:
-        with open(filename, 'wb') as f:
-            f.write(content)
-    except Exception as e:
-        return (False, f"Cannot write output file: {e}")
-
-    return (True, "Success")
-
-
-def download_files(links, path) -> list[tuple[bool, str]]:
-    if len(links) == 0:
-        print(f"Nothing to download into {path}")
-        return []
-
-    with Pool(len(links)) as p:
-        results = p.starmap(download_link, [(link, path) for link in links])
-        return results
-
-
-def parse_download_results(results, method) -> tuple[ItchDownloadResult, str]:
-    global_success = True
-    for success, reason in results:
-        if not success:
-            print(f"Download failed: {reason}")
-            global_success = False
-
-    if global_success:
-        return (ItchDownloadResult.SUCCESS, f"Method #{method} successful.")
-    else:
-        return (ItchDownloadResult.FAILURE, f"Method #{method} partially successful (downloads failed).")
-
-
-def download_title(title_id: int, title_url: str, download_path: str) -> (ItchDownloadResult, str):
-    options = Options()
-    options.add_argument("--headless")
-
-    with webdriver.Chrome(options=options) as driver:
-        wait = WebDriverWait(driver, timeout=15)
-        driver.get(title_url)
-
-        with open(f"{download_path}/index.html", 'w') as f:
-            f.write(driver.page_source)
-
-        skip_purchase_locator = (By.CLASS_NAME, "direct_download_btn")
-
-        try:
-            print("Trying method #1: Purchase Workflow")
-            elem = driver.find_element(By.CLASS_NAME, "buy_btn")
-            elem.click()
-
-            elem = wait.until(EC.presence_of_element_located(skip_purchase_locator))
-            elem.click()
-
-            wait.until(EC.number_of_windows_to_be(2))
-            time.sleep(1)
-
-            first_tab = driver.current_window_handle
-            for window_handle in driver.window_handles:
-                if window_handle != first_tab:
-                    driver.switch_to.window(window_handle)
-                    break
-
-            # We're now on the main downloads page.
-            download_links = try_extract_download_links(driver, title_url)
-            results = download_files(download_links, download_path)
-            return parse_download_results(results, 1)
-        except TimeoutException:
-            print("Method #1 took too long - sleeping for 1m to avoid ~ mystery funsies ~")
-            time.sleep(60)
-
-            return ItchDownloadResult.DOWNLOAD_TIMEOUT, "Download timed out"
-        except NoSuchElementException:
-            print("Method #1 failed.")
-
-        try:
-            print("Trying method #2: Direct Download Workflow")
-            download_links = try_extract_download_links(driver, title_url)
-            results = download_files(download_links, download_path)
-            return parse_download_results(results, 2)
-        except NoSuchElementException:
-            print("Method #2 failed.")
-
-        print("File links missing/no method able to handle target URL.")
-        return ItchDownloadResult.MISSING_DOWNLOAD, "No download method worked."
 
 
-def download_jam(path_to_json: str, continue_from: str=None):
+def download_jam(path_to_json: str, download_to: str, api_key: str, continue_from: str=None):
     try:
         with open(path_to_json) as f:
             jam_json = json.load(f)
     except FileNotFoundError:
         print(f"File {path_to_json} not found.")
     except json.decoder.JSONDecodeError:
-        print(F"Provided file is not a valid JSON file.")
+        print(f"Provided entries file is not a valid JSON file.")
+
+    client = ItchApiClient(ITCH_API, api_key)
+
+    # Check API key validity:
+    profile_req = client.get("/profile")
+    if not profile_req.ok:
+        print(f"Provided API key appears to be invalid: {profile_req.text}")
+        exit(1)
 
     jobs = parse_jobs(jam_json)
     jobs_successful = []
     jobs_failed = []
 
+    download_keys = get_download_keys(client)
+    game_id_to_meta = {}  # dict[game_id: int, (title: str, url: str)]
+
+    for game_id, title, url in jobs:
+        game_id_to_meta[game_id] = (title, url)
+
+    failed_game_ids = set()
+
     # No "continue from"? Yep, start right away.
     should_process_jobs = continue_from is None
 
-    for job in jobs:
-        game_id, title, url = job
+    for game_id, title, url in jobs:
+        label = f"{title} ({game_id})"
         if not should_process_jobs:
             if game_id == continue_from:
                 should_process_jobs = True
             else:
                 continue
 
-        r = requests.get(f"{url}/data.json")
-        if r.status_code != 200:
-            print(f"Missing data for {url}, probably invalid")
-            failed_jobs += url
-            continue
+        try:
+            try:
+                download_path = os.path.join(download_to, slugify(title))
+                if PEDANTIC_MIRRORING:
+                    site_mirror_path = os.path.join(download_to, "_sites")
+                else:
+                    site_mirror_path = os.path.join(download_path, "site")
+
+                os.makedirs(download_path, exist_ok=True)
+                os.makedirs(site_mirror_path, exist_ok=True)
+            except:
+                raise ItchDownloadError(f"Could not create download directory: {download_path}")
 
-        download_path = os.path.join(os.getcwd(), slugify(title))
-        print(f"Trying to download {title} ({game_id}) to {download_path}")
+            print(f"Trying to download {label} to {download_path}")
 
-        if not os.path.isdir(download_path):
-            os.mkdir(download_path)
+            if WGET_PATH is not None:
+                print("Downloading site...")
+                if PEDANTIC_MIRRORING:
+                    extra_wget_args = [
+                        "--timestamping",
+                        "--span-hosts",
+                        "--convert-links",
+                        "--adjust-extension",
+                        "--page-requisites",
+                    ]
+                else:
+                    extra_wget_args = []
+
+                wget = subprocess.run([
+                    WGET_PATH,
+                    *extra_wget_args,
+                    "--quiet",
+                    url
+                ], cwd=site_mirror_path)
+
+                if wget.returncode != 0:
+                    print(f"Warning: Site mirroring failed/incomplete.")
 
+            creds = {}
+            if game_id in download_keys:
+                creds['download_key_id'] = download_keys[game_id]
+                print(f"Using {creds} for private uploads")
+
+            game_uploads_req = client.get(f"/games/{game_id}/uploads", data=creds, timeout=15)
+            if not game_uploads_req.ok:
+                raise ItchDownloadError(f"Could not fetch game uploads for {label}: {game_uploads_req.text}")
+
+            game_uploads = game_uploads_req.json()['uploads']
+            print(f"Found {len(game_uploads)} upload(s)")
 
-        try:
-            status, message = download_title(game_id, url, download_path)
-            print(f"{title}: {status}, {message}")
+            for upload in game_uploads:
+                upload_id = upload['id']
+                file_name = upload['filename']
+                file_size = upload['size']
+                upload_is_external = upload['storage'] == 'external'
 
-            if status == ItchDownloadResult.SUCCESS:
-                jobs_successful.append((title, download_path))
-            else:
-                jobs_failed.append((status, title, url, message))
-        except Exception as e:
-            print(f"Download failed for {title} ({game_id}): {e}")
-            traceback.print_exc()
+                print(f"Downloading '{file_name}' ({upload_id}), {file_size} bytes...")
+                if upload_is_external:
+                    print("***********************************************************")
+                    print("*                                                         *")
+                    print("* WARNING: External storage - downloads will likely fail. *")
+                    print("* Check the URL displayed below manually!                 *")
+                    print("*                                                         *")
+                    print("***********************************************************")
+
+                target_path = os.path.join(download_path, file_name)
+                try:
+                    download_file(client, upload_id, target_path, print_url=upload_is_external)
+                except ItchDownloadError as e:
+                    jobs_failed.append((game_id, file_name, str(e)))
+                    print(f"Download failed for {file_name}: {e}")
+                    continue
 
print(f"\nAll done, downloaded files successfully for {len(jobs_successful)} title(s):")
|
||||
for title, download_path in jobs_successful:
|
||||
print(title)
|
||||
try:
|
||||
actual_file_size = os.stat(target_path).st_size
|
||||
if actual_file_size == file_size:
|
||||
jobs_successful.append((game_id, file_name))
|
||||
else:
|
||||
jobs_failed.append((game_id, file_name, f"File size is {actual_file_size}, expected {file_size}"))
|
||||
except FileNotFoundError:
|
||||
jobs_failed.append((game_id, file_name, "Could not download file"))
|
||||
|
||||
print(f"\nDownloads failed for {len(jobs_failed)} title(s):")
|
||||
for status, title, url, message in jobs_failed:
|
||||
print(f"{title} - {url} - {status}: {message}")
|
||||
print(f"Done downloading {label}")
|
||||
except ItchDownloadError as e:
|
||||
failed_game_ids.append((game_id, str(e)))
|
||||
print(message)
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f"Critical error while downloading {label}: {e}")
|
||||
failed_game_ids.append((game_id, str(e)))
|
||||
traceback.print_exc()
|
||||
print(message)
|
||||
continue
|
||||
|
||||
+    successful_titles = {}
+    for game_id, file_name in jobs_successful:
+        if game_id not in successful_titles:
+            successful_titles[game_id] = [file_name]
+        else:
+            successful_titles[game_id].append(file_name)
+
+    if any(successful_titles):
+        print(f"\nAll done, downloaded files for {len(successful_titles)} title(s):")
+        for game_id, files in successful_titles.items():
+            print(f"{game_id_to_meta[game_id][0]}, {len(files)} file(s)")
+
+    if any(jobs_failed):
+        print(f"\nDownloads failed for {len(jobs_failed)} file(s):")
+        for game_id, file_name, message in jobs_failed:
+            title, url = game_id_to_meta[game_id]
+            print(f"{title} - {file_name} - {message}")
+            print(f"Title URL: {url}")
+
+    if any(failed_game_ids):
+        print(f"\nCompletely failed downloads for {len(failed_game_ids)} titles:")
+        for game_id, message in failed_game_ids:
+            title, url = game_id_to_meta[game_id]
+            print(f"{title} ({game_id}) - {url} - {message}")
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Downloads games from public Itch.io game jams.")
     parser.add_argument("entries", help="path to the game jam entries.json file")
-    parser.add_argument("--continue-from", metavar="ID", help="skip all entries until the provided entry ID is found")
+    parser.add_argument("--api-key", metavar="key", required=True, help="itch.io API key from https://itch.io/user/settings/api-keys")
+    parser.add_argument("--download-to", metavar="path", help="directory to save results into (default: current dir)")
+    parser.add_argument("--continue-from", metavar="id", help="skip all entries until the provided entry ID is found")
     args = parser.parse_args()
 
     continue_id = args.continue_from
@@ -238,4 +302,9 @@ if __name__ == "__main__":
         print("ID to continue from must be an integer.")
         exit(1)
 
-    download_jam(args.entries, continue_from=continue_id)
+    download_to = os.getcwd()
+    if args.download_to is not None:
+        download_to = os.path.normpath(args.download_to)
+        os.makedirs(download_to, exist_ok=True)
+
+    download_jam(args.entries, download_to, args.api_key, continue_from=continue_id)
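Put together, a typical invocation of the new CLI looks like this (`<KEY>` as in the README; `--download-to` and `--continue-from` are optional):

```
python downloader.py --api-key <KEY> --download-to ~/jam-archive entries.json
```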
requirements.txt

@@ -1,3 +1,2 @@
-selenium
 requests
 python-slugify