Trial The Second: Make leafo happy by not murdering his servers

Drop Selenium and clicking around itch - use the API instead.
Pulls HTML5 games, is way faster, uses fewer resources on both ends.
Ryszard Knop 2021-10-03 03:49:49 +02:00
parent 5d0b8e1e99
commit f86044050f
3 changed files with 243 additions and 172 deletions
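
The core of the change: a plain requests session with retry and backoff logic replaces the headless browser. A condensed sketch of the pattern, extracted from the ItchApiClient code in the diff below (the API key value is a placeholder):

```python
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Back off instead of hammering the server: up to 5 attempts, with growing
# pauses, on rate limits (429) and transient server errors (5xx).
retry_strategy = Retry(
    total=5,
    backoff_factor=10,
    allowed_methods=["HEAD", "GET"],
    status_forcelist=[429, 500, 502, 503, 504]
)

session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retry_strategy))

# Authenticated API calls then go through the session, with explicit timeouts:
r = session.get("https://api.itch.io/profile", data={"api_key": "YOUR-KEY-HERE"}, timeout=15)
```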

README.md

@@ -6,20 +6,21 @@ What you'll need:
 - Python 3.8+
 - `pip install -r requirements.txt`
-- [chromedriver](https://chromedriver.chromium.org/downloads) somewhere in your PATH
+- For site mirroring, [wget](https://www.gnu.org/software/wget/) in your PATH.
-On Arch, `pacman -S python chromium python-selenium python-requests python-slugify` works.
+On Arch, `pacman -S wget python python-requests python-slugify` works.

 How to use this:

 - Go to your jam's page, ex. https://itch.io/jam/gbcompo21 and right-click -> View Source.
 - Ctrl-F for `"id":` - it should find that text once, followed by a number. Write it down.
+- (If you found it multiple times, grab the one after ViewJam something something.)
 - Download https://itch.io/jam/NUMBER/entries.json (replacing NUMBER with what you wrote down)
-- Run the downloader: `python downloader.py entries.json`
+- Generate a new API key on your user account page: https://itch.io/user/settings/api-keys
+- Run the downloader: `python downloader.py --api-key <KEY> entries.json`
 - Wait. This is going to take a while.

-**This downloader does not (and probably will not) support HTML5-only games.** (For some of
-these, you might get lucky by hitting F12 while the game loads and grabbing what's in there.)
+The downloader is able to grab more or less everything you can download via the itch app.

 It's expected that the downloader output will not be complete - logs are stupidly verbose and
 it prints a report on successful/failed downloads, so you must manually grab whatever was not
@@ -28,3 +29,5 @@ handled for you automatically for some reason.
 The downloader also grabs the entry page HTML, which usually comes with controls and such. It
 does not download images, external assets and so on, just the text - if the Itch page dies,
 so will most elements on those downloaded pages. Controls should survive, though.
+
+(There's a pedantic mirroring toggle in the script, if you know what you're doing.)
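
If you'd rather script the entries.json step, a minimal sketch of the fetch described above (the jam ID is a placeholder - substitute the number you wrote down):

```python
import requests

JAM_ID = 123456  # placeholder - use your jam's real ID

r = requests.get(f"https://itch.io/jam/{JAM_ID}/entries.json", timeout=15)
r.raise_for_status()

with open("entries.json", "wb") as f:
    f.write(r.content)
```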

downloader.py (Normal file → Executable file)

@@ -4,21 +4,28 @@ import os
 import sys
 import json
 import time
+import shutil
 import hashlib
 import argparse
 import traceback
+import subprocess

 from enum import Enum
-from multiprocessing import Pool

 import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
 from slugify import slugify

-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-from selenium.common.exceptions import NoSuchElementException, TimeoutException
+WGET_PATH = shutil.which("wget")
+if WGET_PATH is None:
+    print("Warning: wget not available, site mirroring will not work!")
+
+# Try to download all site assets, images etc. included.
+# You probably don't want this, but here you go!
+PEDANTIC_MIRRORING = False
+
+ITCH_API = "https://api.itch.io"
 class ItchDownloadResult(Enum):
@@ -28,206 +35,263 @@ class ItchDownloadResult(Enum):
     DOWNLOAD_TIMEOUT = 3
+
+class ItchDownloadError(Exception):
+    pass
+
+
+class ItchApiClient():
+    def __init__(self, base_url: str, api_key: str):
+        self.base_url = base_url
+        self.api_key = api_key
+
+        self.requests = requests.Session()
+        retry_strategy = Retry(
+            total=5,
+            backoff_factor=10,
+            allowed_methods=["HEAD", "GET"],
+            status_forcelist=[429, 500, 502, 503, 504]
+        )
+
+        # No timeouts - set them explicitly on API calls below!
+        adapter = HTTPAdapter(max_retries=retry_strategy)
+        self.requests.mount("https://", adapter)
+        self.requests.mount("http://", adapter)
+
+    def add_api_key(self, kwargs):
+        # Adds the API key to request params, if one was not
+        # already provided outside of the client.
+        if 'data' in kwargs:
+            params = kwargs['data']
+        else:
+            params = {}
+            kwargs['data'] = params
+
+        if 'api_key' not in params:
+            params['api_key'] = self.api_key
+
+    def get(self, endpoint: str, *args, **kwargs):
+        self.add_api_key(kwargs)
+        return self.requests.get(self.base_url + endpoint, *args, **kwargs)
+
+
+def download_file(client: ItchApiClient, upload_id: int, download_path: str, print_url: bool=False):
+    # No timeouts, chunked uploads, default retry strategy, should be all good?
+    try:
+        with client.get(f"/uploads/{upload_id}/download", stream=True) as r:
+            r.raise_for_status()
+
+            if print_url:
+                print(f"Download URL: {r.url}")
+
+            with open(download_path, 'wb') as f:
+                for chunk in r.iter_content(chunk_size=1048576):  # 1MB chunks
+                    f.write(chunk)
+    except requests.exceptions.HTTPError as e:
+        raise ItchDownloadError(f"Unrecoverable download error: {e}")
+
+
+def get_download_keys(client: ItchApiClient):
+    print("Fetching all download keys...")
+    download_keys = {}
+    page = 1
+
+    while True:
+        print(f"Downloading page {page}...")
+        try:
+            r = client.get("/profile/owned-keys", data={"page": page}, timeout=15)
+            r.raise_for_status()
+        except Exception as e:
+            print(f"Got error while fetching download keys: {e}")
+            print("Let's just pretend this is enough and move on...")
+            break
+
+        data = r.json()
+        if 'owned_keys' not in data:
+            break  # Assuming we're out of keys already...
+
+        for key in data['owned_keys']:
+            download_keys[key['game_id']] = key['id']
+
+        if len(data['owned_keys']) == data['per_page']:
+            page += 1
+        else:
+            break
+
+    print(f"Fetched {len(download_keys)} download keys.")
+    return download_keys


 def parse_jobs(jam_json: dict) -> list[tuple[int, str, str]]:
     if 'jam_games' not in jam_json:
         raise Exception("Provided JSON is not a valid itch.io jam JSON.")

     # Extract (id, url) pairs from all the entries.
-    return [(e['game']['id'], e['game']['title'], e['game']['url']) for e in jam_json['jam_games']]
+    return [(int(e['game']['id']), e['game']['title'], e['game']['url']) for e in jam_json['jam_games']]


-def try_extract_download_links(driver: webdriver.Chrome, title_url: str) -> list[str]:
-    elems = driver.find_elements(By.CLASS_NAME, "download_btn")
-    if len(elems) == 0:
-        raise NoSuchElementException("No download links found.")
-
-    cookie = driver.get_cookie("itchio_token")['value']
-    file_ids = [elem.get_attribute("data-upload_id") for elem in elems]
-
-    file_urls = []
-    for file_id in file_ids:
-        meta_url = f"{title_url}/file/{file_id}"
-        r = requests.post(meta_url, data={"csrf_token": cookie})
-        if r.ok:
-            file_urls.append(r.json()['url'])
-        else:
-            print(f"Error downloading metadata for file {file_id} (status {r.status_code}): {r.text}")
-
-    print(f"Extracted URLs: {file_urls}")
-    return file_urls
-
-
-def download_link(link: str, path: str) -> tuple[bool, str]:
-    r = requests.get(link)
-    if not r.ok:
-        return (False, r.reason)
-
-    # The bytes we need:
-    content = r.content
-
-    # Figure out the filename:
-    if 'Content-Disposition' in r.headers:
-        name = r.headers['Content-Disposition']
-        name = name.removeprefix('attachment; filename="').removesuffix('"')
-    else:  # uhhhh random bullshit go, good luck?
-        md5 = hashlib.md5()
-        md5.update(content)
-        name = md5.hexdigest()
-
-    # Make sure we don't overwrite files with the same name.
-    fcounter = 1
-    filename = f"{path}/{name}"
-    while os.path.exists(filename):
-        fcounter += 1
-        filename = f"{path}/{name}.{fcounter}"
-
-    try:
-        with open(filename, 'wb') as f:
-            f.write(content)
-    except Exception as e:
-        return (False, f"Cannot write output file: {e}")
-
-    return (True, "Success")
-
-
-def download_files(links, path) -> list[tuple[bool, str]]:
-    if len(links) == 0:
-        print(f"Nothing to download into {path}")
-        return []
-
-    with Pool(len(links)) as p:
-        results = p.starmap(download_link, [(link, path) for link in links])
-    return results
-
-
-def parse_download_results(results, method) -> tuple[ItchDownloadResult, str]:
-    global_success = True
-    for success, reason in results:
-        if not success:
-            print(f"Download failed: {reason}")
-            global_success = False
-
-    if global_success:
-        return (ItchDownloadResult.SUCCESS, f"Method #{method} successful.")
-    else:
-        return (ItchDownloadResult.FAILURE, f"Method #{method} partially successful (downloads failed).")
-
-
-def download_title(title_id: int, title_url: str, download_path: str) -> (ItchDownloadResult, str):
-    options = Options()
-    options.add_argument("--headless")
-
-    with webdriver.Chrome(options=options) as driver:
-        wait = WebDriverWait(driver, timeout=15)
-        driver.get(title_url)
-
-        with open(f"{download_path}/index.html", 'w') as f:
-            f.write(driver.page_source)
-
-        skip_purchase_locator = (By.CLASS_NAME, "direct_download_btn")
-
-        try:
-            print("Trying method #1: Purchase Workflow")
-            elem = driver.find_element(By.CLASS_NAME, "buy_btn")
-            elem.click()
-
-            elem = wait.until(EC.presence_of_element_located(skip_purchase_locator))
-            elem.click()
-
-            wait.until(EC.number_of_windows_to_be(2))
-            time.sleep(1)
-
-            first_tab = driver.current_window_handle
-            for window_handle in driver.window_handles:
-                if window_handle != first_tab:
-                    driver.switch_to.window(window_handle)
-                    break
-
-            # We're now on the main downloads page.
-            download_links = try_extract_download_links(driver, title_url)
-            results = download_files(download_links, download_path)
-            return parse_download_results(results, 1)
-        except TimeoutException:
-            print("Method #1 took too long - sleeping for 1m to avoid ~ mystery funsies ~")
-            time.sleep(60)
-            return ItchDownloadResult.DOWNLOAD_TIMEOUT, "Download timed out"
-        except NoSuchElementException:
-            print("Method #1 failed.")
-
-        try:
-            print("Trying method #2: Direct Download Workflow")
-            download_links = try_extract_download_links(driver, title_url)
-            results = download_files(download_links, download_path)
-            return parse_download_results(results, 2)
-        except NoSuchElementException:
-            print("Method #2 failed.")
-
-    print("File links missing/no method able to handle target URL.")
-    return ItchDownloadResult.MISSING_DOWNLOAD, "No download method worked."
-
-
-def download_jam(path_to_json: str, continue_from: str=None):
+def download_jam(path_to_json: str, download_to: str, api_key: str, continue_from: str=None):
     try:
try: try:
with open(path_to_json) as f: with open(path_to_json) as f:
jam_json = json.load(f) jam_json = json.load(f)
except FileNotFoundError: except FileNotFoundError:
print(f"File {path_to_json} not found.") print(f"File {path_to_json} not found.")
except json.decoder.JSONDecodeError: except json.decoder.JSONDecodeError:
print(F"Provided file is not a valid JSON file.") print(F"Provided entries file is not a valid JSON file.")

+    client = ItchApiClient(ITCH_API, api_key)
+
+    # Check API key validity:
+    profile_req = client.get("/profile", timeout=15)
+    if not profile_req.ok:
+        print(f"Provided API key appears to be invalid: {profile_req.text}")
+        exit(1)

     jobs = parse_jobs(jam_json)

     jobs_successful = []
     jobs_failed = []
+
+    download_keys = get_download_keys(client)
+
+    game_id_to_meta = {}  # dict[game_id: int, (title: str, url: str)]
+    for game_id, title, url in jobs:
+        game_id_to_meta[game_id] = (title, url)
+
+    failed_game_ids = []
# No "continue from"? Yep, start right away. # No "continue from"? Yep, start right away.
should_process_jobs = continue_from is None should_process_jobs = continue_from is None
for job in jobs: for game_id, title, url in jobs:
game_id, title, url = job label = f"{title} ({game_id})"
if not should_process_jobs: if not should_process_jobs:
if game_id == continue_from: if game_id == continue_from:
should_process_jobs = True should_process_jobs = True
else: else:
continue continue
-        r = requests.get(f"{url}/data.json")
-        if r.status_code != 200:
-            print(f"Missing data for {url}, probably invalid")
-            failed_jobs += url
-            continue
-
-        download_path = os.path.join(os.getcwd(), slugify(title))
-        print(f"Trying to download {title} ({game_id}) to {download_path}")
-
-        if not os.path.isdir(download_path):
-            os.mkdir(download_path)
-
-        try:
-            status, message = download_title(game_id, url, download_path)
-            print(f"{title}: {status}, {message}")
-
-            if status == ItchDownloadResult.SUCCESS:
-                jobs_successful.append((title, download_path))
-            else:
-                jobs_failed.append((status, title, url, message))
-        except Exception as e:
-            print(f"Download failed for {title} ({game_id}): {e}")
-            traceback.print_exc()
-            continue
-
-    print(f"\nAll done, downloaded files successfully for {len(jobs_successful)} title(s):")
-    for title, download_path in jobs_successful:
-        print(title)
-
-    print(f"\nDownloads failed for {len(jobs_failed)} title(s):")
-    for status, title, url, message in jobs_failed:
-        print(f"{title} - {url} - {status}: {message}")
+        try:
+            download_path = os.path.join(download_to, slugify(title))
+            if PEDANTIC_MIRRORING:
+                site_mirror_path = os.path.join(download_to, "_sites")
+            else:
+                site_mirror_path = os.path.join(download_path, "site")
+
+            os.makedirs(download_path, exist_ok=True)
+            os.makedirs(site_mirror_path, exist_ok=True)
+        except OSError:
+            raise ItchDownloadError(f"Could not create download directory: {download_path}")
+
+        print(f"Trying to download {label} to {download_path}")
+
+        if WGET_PATH is not None:
+            print("Downloading site...")
+            if PEDANTIC_MIRRORING:
+                extra_wget_args = [
+                    "--timestamping",
+                    "--span-hosts",
+                    "--convert-links",
+                    "--adjust-extension",
+                    "--page-requisites",
+                ]
+            else:
+                extra_wget_args = []
+
+            wget = subprocess.run([
+                WGET_PATH,
+                *extra_wget_args,
+                "--quiet",
+                url
+            ], cwd=site_mirror_path)
+
+            if wget.returncode != 0:
+                print("Warning: Site mirroring failed/incomplete.")
+
+        creds = {}
+        if game_id in download_keys:
+            creds['download_key_id'] = download_keys[game_id]
+            print(f"Using {creds} for private uploads")
+
+        game_uploads_req = client.get(f"/games/{game_id}/uploads", data=creds, timeout=15)
+        if not game_uploads_req.ok:
+            raise ItchDownloadError(f"Could not fetch game uploads for {label}: {game_uploads_req.text}")
+
+        game_uploads = game_uploads_req.json()['uploads']
+        print(f"Found {len(game_uploads)} upload(s)")
+
+        try:
+            for upload in game_uploads:
+                upload_id = upload['id']
+                file_name = upload['filename']
+                file_size = upload['size']
+                upload_is_external = upload['storage'] == 'external'
+
+                print(f"Downloading '{file_name}' ({upload_id}), {file_size} bytes...")
+                if upload_is_external:
+                    print("***********************************************************")
+                    print("*                                                         *")
+                    print("* WARNING: External storage - downloads will likely fail. *")
+                    print("* Check the URL displayed below manually!                 *")
+                    print("*                                                         *")
+                    print("***********************************************************")
+
+                target_path = os.path.join(download_path, file_name)
+                try:
+                    download_file(client, upload_id, target_path, print_url=upload_is_external)
+                except ItchDownloadError as e:
+                    jobs_failed.append((game_id, file_name, str(e)))
+                    print(f"Download failed for {file_name}: {e}")
+                    continue
+
+                try:
+                    actual_file_size = os.stat(target_path).st_size
+                    if actual_file_size == file_size:
+                        jobs_successful.append((game_id, file_name))
+                    else:
+                        jobs_failed.append((game_id, file_name, f"File size is {actual_file_size}, expected {file_size}"))
+                except FileNotFoundError:
+                    jobs_failed.append((game_id, file_name, "Could not download file"))
+
+            print(f"Done downloading {label}")
+        except ItchDownloadError as e:
+            failed_game_ids.append((game_id, str(e)))
+            print(str(e))
+            continue
+        except Exception as e:
+            print(f"Critical error while downloading {label}: {e}")
+            failed_game_ids.append((game_id, str(e)))
+            traceback.print_exc()
+            continue
+
+    successful_titles = {}
+    for game_id, file_name in jobs_successful:
+        if game_id not in successful_titles:
+            successful_titles[game_id] = [file_name]
+        else:
+            successful_titles[game_id].append(file_name)
+
+    if any(successful_titles):
+        print(f"\nAll done, downloaded files for {len(successful_titles)} title(s):")
+        for game_id, files in successful_titles.items():
+            print(f"{game_id_to_meta[game_id][0]}, {len(files)} file(s)")
+
+    if any(jobs_failed):
+        print(f"\nDownloads failed for {len(jobs_failed)} file(s):")
+        for game_id, file_name, message in jobs_failed:
+            title, url = game_id_to_meta[game_id]
+            print(f"{title} - {file_name} - {message}")
+            print(f"Title URL: {url}")
+
+    if any(failed_game_ids):
+        print(f"\nCompletely failed downloads for {len(failed_game_ids)} title(s):")
+        for game_id, message in failed_game_ids:
+            title, url = game_id_to_meta[game_id]
+            print(f"{title} ({game_id}) - {url} - {message}")


 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Downloads games from public Itch.io game jams.")
     parser.add_argument("entries", help="path to the game jam entries.json file")
-    parser.add_argument("--continue-from", metavar="ID", help="skip all entries until the provided entry ID is found")
+    parser.add_argument("--api-key", metavar="key", required=True, help="itch.io API key from https://itch.io/user/settings/api-keys")
+    parser.add_argument("--download-to", metavar="path", help="directory to save results into (default: current dir)")
+    parser.add_argument("--continue-from", metavar="id", help="skip all entries until the provided entry ID is found")
     args = parser.parse_args()

     continue_id = args.continue_from
@@ -238,4 +302,9 @@ if __name__ == "__main__":
print("ID to continue from must be an integer.") print("ID to continue from must be an integer.")
exit(1) exit(1)
download_jam(args.entries, continue_from=continue_id) download_to = os.getcwd()
if args.download_to is not None:
download_to = os.path.normpath(args.download_to)
os.makedirs(download_to)
download_jam(args.entries, download_to, args.api_key, continue_from=continue_id)
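
Taken together, the new arguments give an invocation along these lines (the key, target directory, and entry ID are placeholders):

```
python downloader.py --api-key abc123 --download-to ./gbcompo21 --continue-from 123456 entries.json
```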

requirements.txt

@@ -1,3 +1,2 @@
-selenium
 requests
 python-slugify