mirror of
https://github.com/DragoonAethis/itch-dl.git
synced 2025-01-22 01:41:11 +01:00
Trial The Second: Make leafo happy by not murdering his servers
Drop Selenium and clicking around itch - use the API instead. Pulls HTML5 games, is way faster, uses less resources on both ends.
This commit is contained in:
parent
5d0b8e1e99
commit
f86044050f
13
README.md
13
README.md
@ -6,20 +6,21 @@ What you'll need:
|
||||
|
||||
- Python 3.8+
|
||||
- `pip install -r requirements.txt`
|
||||
- [chromedriver](https://chromedriver.chromium.org/downloads) somewhere in your PATH
|
||||
- For site mirroring, [wget](https://www.gnu.org/software/wget/) in your PATH.
|
||||
|
||||
On Arch, `pacman -S python chromium python-selenium python-requests python-slugify` works.
|
||||
On Arch, `pacman -S wget python python-requests python-slugify` works.
|
||||
|
||||
How to use this:
|
||||
|
||||
- Go to your jam's page, ex. https://itch.io/jam/gbcompo21 and right-click -> View Source.
|
||||
- Ctrl-F for `"id":` - it should find that text once, followed by a number. Write it down.
|
||||
- (If you found it multiple times, grab the one that appears after the text starting with `ViewJam`.)
|
||||
- Download https://itch.io/jam/NUMBER/entries.json (replacing NUMBER with what you wrote down)
|
||||
- Run the downloader: `python downloader.py entries.json`
|
||||
- Generate a new API key on your user account page: https://itch.io/user/settings/api-keys
|
||||
- Run the downloader: `python downloader.py --api-key <KEY> entries.json`
|
||||
- Wait. This is going to take a while.
|
||||
|
||||
**This downloader does not (and probably will not) support HTML5-only games.** (For some of
|
||||
these, you might get lucky by hitting F12 while the game loads and grabbing what's in there.)
|
||||
The downloader is able to grab more or less everything you can download via the itch app.
|
||||
|
||||
It's expected that the downloader output will not be complete - logs are stupidly verbose and
|
||||
it prints a report on successful/failed downloads, so you must manually grab whatever was not
|
||||
@ -28,3 +29,5 @@ handled for you automatically for some reason.
|
||||
The downloader also grabs the entry page HTML, which usually comes with controls and such. It
|
||||
does not download images, external assets and so on, just the text - if the Itch page dies,
|
||||
so will most elements on those downloaded pages. Controls should survive, though.
|
||||
|
||||
(There's also a pedantic mirroring toggle in the script, if you know what you're doing.)
|
||||
|
401
downloader.py
Normal file → Executable file
401
downloader.py
Normal file → Executable file
@ -4,21 +4,28 @@ import os
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
import shutil
|
||||
import hashlib
|
||||
import argparse
|
||||
import traceback
|
||||
import subprocess
|
||||
from enum import Enum
|
||||
from multiprocessing import Pool
|
||||
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from requests.packages.urllib3.util.retry import Retry
|
||||
|
||||
from slugify import slugify
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import NoSuchElementException, TimeoutException
|
||||
# Location of the wget executable, or None when it is not on PATH.
WGET_PATH = shutil.which("wget")
if WGET_PATH is None:
    print("Warning: wget not available, site mirroring will not work!")

# When enabled, wget is asked to fetch every site asset (images etc. included).
# You probably don't want this, but here you go!
PEDANTIC_MIRRORING = False

# Base URL that all API endpoints are appended to.
ITCH_API = "https://api.itch.io"
|
||||
|
||||
|
||||
class ItchDownloadResult(Enum):
|
||||
@ -28,206 +35,263 @@ class ItchDownloadResult(Enum):
|
||||
DOWNLOAD_TIMEOUT = 3
|
||||
|
||||
|
||||
class ItchDownloadError(Exception):
    """Signals that a download-related step failed and could not be completed."""
|
||||
|
||||
|
||||
class ItchApiClient:
    """Small wrapper around a ``requests`` session for talking to the itch.io API.

    Requests made through :meth:`get` target ``base_url + endpoint`` and carry
    the API key in their ``data`` payload.
    """

    def __init__(self, base_url: str, api_key: str):
        """Create the session and install the retry policy.

        Args:
            base_url: Prefix prepended to every endpoint passed to :meth:`get`.
            api_key: Key added to requests that do not already carry one.
        """
        self.base_url = base_url
        self.api_key = api_key

        self.requests = requests.Session()

        # Retry idempotent requests on rate limiting and server-side errors.
        retry_strategy = Retry(
            total=5,
            backoff_factor=10,
            allowed_methods=["HEAD", "GET"],
            status_forcelist=[429, 500, 502, 503, 504],
        )

        # No timeouts - set them explicitly on API calls below!
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.requests.mount("https://", adapter)
        self.requests.mount("http://", adapter)

    def add_api_key(self, kwargs):
        """Ensure ``kwargs['data']`` contains an ``api_key`` entry.

        A key already present in the payload is left untouched. The payload is
        copied first so the dictionary the caller passed as ``data`` is never
        modified (the API key must not leak into caller-owned state), and a
        missing or ``None`` payload is treated as empty.

        Args:
            kwargs: Keyword arguments about to be forwarded to ``requests``;
                updated in place with the new ``data`` payload.
        """
        params = dict(kwargs.get('data') or {})
        params.setdefault('api_key', self.api_key)
        kwargs['data'] = params

    def get(self, endpoint: str, *args, **kwargs):
        """Send a GET request to ``base_url + endpoint`` with the API key attached.

        Extra positional and keyword arguments are forwarded to the session's
        ``get``; the response object is returned as-is.
        """
        self.add_api_key(kwargs)
        return self.requests.get(self.base_url + endpoint, *args, **kwargs)
|
||||
|
||||
|
||||
def download_file(client: ItchApiClient, upload_id: int, download_path: str, print_url: bool=False):
    """Stream one upload from the API into a local file.

    Args:
        client: API client used to request ``/uploads/<id>/download``.
        upload_id: ID of the upload to fetch.
        download_path: File path the response body is written to.
        print_url: When true, print the final URL of the response.

    Raises:
        ItchDownloadError: If the request fails at the HTTP or transport level
            (bad status, connection error, exhausted retries, broken stream).
    """
    # No timeouts, chunked uploads, default retry strategy, should be all good?
    try:
        with client.get(f"/uploads/{upload_id}/download", stream=True) as r:
            r.raise_for_status()
            if print_url:
                print(f"Download URL: {r.url}")

            with open(download_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1048576):  # 1MB chunks
                    f.write(chunk)
    except requests.exceptions.RequestException as e:
        # HTTPError is a RequestException; catching the base class also wraps
        # transport failures so callers only need to handle ItchDownloadError.
        raise ItchDownloadError(f"Unrecoverable download error: {e}") from e
|
||||
|
||||
|
||||
def get_download_keys(client: ItchApiClient):
    """Collect the download keys reported by ``/profile/owned-keys``.

    Pages are requested one after another until a page has no ``owned_keys``
    entry, holds fewer entries than its ``per_page`` value, or a request
    fails. Failures are reported and end the loop; whatever was collected so
    far is still returned.

    Args:
        client: API client used for the requests.

    Returns:
        A dict mapping each key's ``game_id`` to the key's ``id``.
    """
    print("Fetching all download keys...")
    download_keys = {}
    page = 1

    while True:
        print(f"Downloading page {page}...")
        try:
            r = client.get("/profile/owned-keys", data={"page": page}, timeout=15)
            r.raise_for_status()
            # Decoding belongs to the best-effort section too: a non-JSON
            # body should end the listing, not crash the whole run.
            data = r.json()
        except Exception as e:
            print(f"Got error while fetching download keys: {e}")
            print("Let's just pretend this is enough and move on...")
            break

        if 'owned_keys' not in data:
            break  # Assuming we're out of keys already...

        for key in data['owned_keys']:
            download_keys[key['game_id']] = key['id']

        # Only a completely filled page suggests that another one follows; a
        # response without 'per_page' is treated as the last page.
        if len(data['owned_keys']) == data.get('per_page'):
            page += 1
        else:
            break

    print(f"Fetched {len(download_keys)} download keys.")
    return download_keys
|
||||
|
||||
|
||||
def parse_jobs(jam_json: dict) -> list[tuple[int, str, str]]:
    """Extract ``(game_id, title, url)`` triples from a jam entries document.

    Args:
        jam_json: Parsed contents of a jam ``entries.json`` file.

    Returns:
        One ``(id, title, url)`` tuple per item of ``jam_json['jam_games']``,
        in the original order.

    Raises:
        Exception: If the document has no ``jam_games`` key.
    """
    if 'jam_games' not in jam_json:
        raise Exception("Provided JSON is not a valid itch.io jam JSON.")

    # IDs are coerced to int so the result matches the declared return type
    # even when the JSON carries them as strings.
    return [
        (int(game['id']), game['title'], game['url'])
        for game in (entry['game'] for entry in jam_json['jam_games'])
    ]
|
||||
|
||||
|
||||
def try_extract_download_links(driver: webdriver.Chrome, title_url: str) -> list[str]:
    """Resolve the download buttons on the current page into file URLs.

    Each ``download_btn`` element's ``data-upload_id`` is posted to
    ``<title_url>/file/<id>`` together with the ``itchio_token`` cookie value;
    the ``url`` field of every successful response is collected. Failed
    lookups are reported and skipped.

    Raises:
        NoSuchElementException: If the page has no ``download_btn`` elements.
    """
    buttons = driver.find_elements(By.CLASS_NAME, "download_btn")
    if len(buttons) == 0:
        raise NoSuchElementException("No download links found.")

    csrf_token = driver.get_cookie("itchio_token")['value']

    # Read all upload IDs up front, before any metadata request is sent.
    upload_ids = []
    for button in buttons:
        upload_ids.append(button.get_attribute("data-upload_id"))

    file_urls = []
    for file_id in upload_ids:
        response = requests.post(f"{title_url}/file/{file_id}", data={"csrf_token": csrf_token})
        if not response.ok:
            print(f"Error downloading metadata for file {file_id} (status {response.status_code}): {response.text}")
            continue

        file_urls.append(response.json()['url'])

    print(f"Extracted URLs: {file_urls}")
    return file_urls
|
||||
|
||||
|
||||
def download_link(link: str, path: str) -> tuple[bool, str]:
    """Fetch ``link`` and store its body as a new file inside ``path``.

    The file name comes from the ``Content-Disposition`` header when present,
    otherwise from the MD5 hex digest of the body. Existing files are never
    overwritten: a numeric suffix (``.2``, ``.3``, ...) is appended instead.

    Returns:
        ``(True, "Success")`` on success, otherwise ``(False, reason)``.
    """
    response = requests.get(link)
    if not response.ok:
        return (False, response.reason)

    # The bytes we need:
    payload = response.content

    # Figure out the filename:
    if 'Content-Disposition' not in response.headers:
        # No name offered by the server - fall back to a content hash.
        name = hashlib.md5(payload).hexdigest()
    else:
        disposition = response.headers['Content-Disposition']
        name = disposition.removeprefix('attachment; filename="').removesuffix('"')

    # Make sure we don't overwrite files with the same name.
    target = f"{path}/{name}"
    attempt = 1
    while os.path.exists(target):
        attempt += 1
        target = f"{path}/{name}.{attempt}"

    try:
        with open(target, 'wb') as out:
            out.write(payload)
    except Exception as e:
        return (False, f"Cannot write output file: {e}")

    return (True, "Success")
|
||||
|
||||
|
||||
def download_files(links, path) -> list[tuple[bool, str]]:
    """Download every link into ``path`` using one worker process per link.

    Returns:
        The ``download_link`` results in the order of ``links``; an empty list
        when there is nothing to download.
    """
    link_count = len(links)
    if link_count == 0:
        print(f"Nothing to download into {path}")
        return []

    jobs = [(link, path) for link in links]
    with Pool(link_count) as pool:
        results = pool.starmap(download_link, jobs)

    return results
|
||||
|
||||
|
||||
def parse_download_results(results, method) -> tuple[ItchDownloadResult, str]:
    """Summarise per-file ``(success, reason)`` results for one download method.

    Every failed entry is printed. The outcome is ``SUCCESS`` when no entry
    failed and ``FAILURE`` otherwise; the message names the method number.
    """
    failure_count = 0
    for succeeded, reason in results:
        if succeeded:
            continue

        print(f"Download failed: {reason}")
        failure_count += 1

    if failure_count == 0:
        return (ItchDownloadResult.SUCCESS, f"Method #{method} successful.")

    return (ItchDownloadResult.FAILURE, f"Method #{method} partially successful (downloads failed).")
|
||||
|
||||
|
||||
def download_title(title_id: int, title_url: str, download_path: str) -> (ItchDownloadResult, str):
    """Download one title's files by driving a headless Chrome session.

    The entry page is saved as ``index.html`` in ``download_path``. Two
    methods are then tried in order: the purchase workflow (``buy_btn`` then
    ``direct_download_btn``, which opens the downloads page in a second
    window), and extracting the links straight from the entry page.
    ``title_id`` is not used by this function.

    Returns:
        A ``(status, message)`` pair describing the outcome.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    with webdriver.Chrome(options=chrome_options) as driver:
        waiter = WebDriverWait(driver, timeout=15)
        driver.get(title_url)

        # Keep a copy of the entry page next to the downloaded files.
        with open(f"{download_path}/index.html", 'w') as page_file:
            page_file.write(driver.page_source)

        direct_download_locator = (By.CLASS_NAME, "direct_download_btn")

        try:
            print("Trying method #1: Purchase Workflow")
            driver.find_element(By.CLASS_NAME, "buy_btn").click()
            waiter.until(EC.presence_of_element_located(direct_download_locator)).click()

            waiter.until(EC.number_of_windows_to_be(2))
            time.sleep(1)

            # Move to the first window that is not the one we started in.
            origin_tab = driver.current_window_handle
            other_tab = next((handle for handle in driver.window_handles if handle != origin_tab), None)
            if other_tab is not None:
                driver.switch_to.window(other_tab)

            # We're now on the main downloads page.
            links = try_extract_download_links(driver, title_url)
            return parse_download_results(download_files(links, download_path), 1)
        except TimeoutException:
            print("Method #1 took too long - sleeping for 1m to avoid ~ mystery funsies ~")
            time.sleep(60)

            return ItchDownloadResult.DOWNLOAD_TIMEOUT, "Download timed out"
        except NoSuchElementException:
            print("Method #1 failed.")

        try:
            print("Trying method #2: Direct Download Workflow")
            links = try_extract_download_links(driver, title_url)
            return parse_download_results(download_files(links, download_path), 2)
        except NoSuchElementException:
            print("Method #2 failed.")

        print("File links missing/no method able to handle target URL.")
        return ItchDownloadResult.MISSING_DOWNLOAD, "No download method worked."
|
||||
|
||||
def download_jam(path_to_json: str, continue_from: str=None):
|
||||
def download_jam(path_to_json: str, download_to: str, api_key: str, continue_from: str=None):
|
||||
try:
|
||||
with open(path_to_json) as f:
|
||||
jam_json = json.load(f)
|
||||
except FileNotFoundError:
|
||||
print(f"File {path_to_json} not found.")
|
||||
except json.decoder.JSONDecodeError:
|
||||
print(F"Provided file is not a valid JSON file.")
|
||||
print(F"Provided entries file is not a valid JSON file.")
|
||||
|
||||
client = ItchApiClient(ITCH_API, api_key)
|
||||
|
||||
# Check API key validity:
|
||||
profile_req = client.get("/profile")
|
||||
if not profile_req.ok:
|
||||
print(f"Provided API key appears to be invalid: {profile_req.text}")
|
||||
exit(1)
|
||||
|
||||
jobs = parse_jobs(jam_json)
|
||||
jobs_successful = []
|
||||
jobs_failed = []
|
||||
|
||||
download_keys = get_download_keys(client)
|
||||
game_id_to_meta = {} # dict[game_id: int, (title: str, url: str)]
|
||||
|
||||
for game_id, title, url in jobs:
|
||||
game_id_to_meta[game_id] = (title, url)
|
||||
|
||||
failed_game_ids = set()
|
||||
|
||||
# No "continue from"? Yep, start right away.
|
||||
should_process_jobs = continue_from is None
|
||||
|
||||
for job in jobs:
|
||||
game_id, title, url = job
|
||||
for game_id, title, url in jobs:
|
||||
label = f"{title} ({game_id})"
|
||||
if not should_process_jobs:
|
||||
if game_id == continue_from:
|
||||
should_process_jobs = True
|
||||
else:
|
||||
continue
|
||||
|
||||
r = requests.get(f"{url}/data.json")
|
||||
if r.status_code != 200:
|
||||
print(f"Missing data for {url}, probably invalid")
|
||||
failed_jobs += url
|
||||
continue
|
||||
try:
|
||||
download_path = os.path.join(download_to, slugify(title))
|
||||
if PEDANTIC_MIRRORING:
|
||||
site_mirror_path = os.path.join(download_to, "_sites")
|
||||
else:
|
||||
site_mirror_path = os.path.join(download_path, "site")
|
||||
os.makedirs(download_path, exist_ok=True)
|
||||
os.makedirs(site_mirror_path, exist_ok=True)
|
||||
except:
|
||||
raise ItchDownloadError(f"Could not create download directory: {download_path}")
|
||||
|
||||
download_path = os.path.join(os.getcwd(), slugify(title))
|
||||
print(f"Trying to download {title} ({game_id}) to {download_path}")
|
||||
print(f"Trying to download {label} to {download_path}")
|
||||
|
||||
if not os.path.isdir(download_path):
|
||||
os.mkdir(download_path)
|
||||
if WGET_PATH is not None:
|
||||
print("Downloading site...")
|
||||
if PEDANTIC_MIRRORING:
|
||||
extra_wget_args = [
|
||||
"--timestamping",
|
||||
"--span-hosts",
|
||||
"--convert-links",
|
||||
"--adjust-extension",
|
||||
"--page-requisites",
|
||||
]
|
||||
else:
|
||||
extra_wget_args = []
|
||||
|
||||
wget = subprocess.run([
|
||||
WGET_PATH,
|
||||
*extra_wget_args,
|
||||
"--quiet",
|
||||
url
|
||||
], cwd=site_mirror_path)
|
||||
|
||||
if wget.returncode != 0:
|
||||
print(f"Warning: Site mirroring failed/incomplete.")
|
||||
|
||||
creds = {}
|
||||
if game_id in download_keys:
|
||||
creds['download_key_id'] = download_keys[game_id]
|
||||
print("Using {creds} for private uploads")
|
||||
|
||||
game_uploads_req = client.get(f"/games/{game_id}/uploads", data=creds, timeout=15)
|
||||
if not game_uploads_req.ok:
|
||||
raise ItchDownloadError(f"Could not fetch game uploads for {label}: {game_uploads_req.text}")
|
||||
|
||||
game_uploads = game_uploads_req.json()['uploads']
|
||||
print(f"Found {len(game_uploads)} upload(s)")
|
||||
|
||||
try:
|
||||
status, message = download_title(game_id, url, download_path)
|
||||
print(f"{title}: {status}, {message}")
|
||||
for upload in game_uploads:
|
||||
upload_id = upload['id']
|
||||
file_name = upload['filename']
|
||||
file_size = upload['size']
|
||||
upload_is_external = upload['storage'] == 'external'
|
||||
|
||||
if status == ItchDownloadResult.SUCCESS:
|
||||
jobs_successful.append((title, download_path))
|
||||
else:
|
||||
jobs_failed.append((status, title, url, message))
|
||||
print(f"Downloading '{file_name}' ({upload_id}), {file_size} bytes...")
|
||||
if upload_is_external:
|
||||
print("***********************************************************")
|
||||
print("* *")
|
||||
print("* WARNING: External storage - downloads will likely fail. *")
|
||||
print("* Check the URL displayed below manually! *")
|
||||
print("* *")
|
||||
print("***********************************************************")
|
||||
|
||||
target_path = os.path.join(download_path, file_name)
|
||||
try:
|
||||
download_file(client, upload_id, target_path, print_url=upload_is_external)
|
||||
except ItchDownloadError as e:
|
||||
jobs_failed.append((game_id, file_name, str(e)))
|
||||
print(f"Download failed for {file_name}: {e}")
|
||||
continue
|
||||
|
||||
try:
|
||||
actual_file_size = os.stat(target_path).st_size
|
||||
if actual_file_size == file_size:
|
||||
jobs_successful.append((game_id, file_name))
|
||||
else:
|
||||
jobs_failed.append((game_id, file_name, f"File size is {actual_file_size}, expected {file_size}"))
|
||||
except FileNotFoundError:
|
||||
jobs_failed.append((game_id, file_name, "Could not download file"))
|
||||
|
||||
print(f"Done downloading {label}")
|
||||
except ItchDownloadError as e:
|
||||
failed_game_ids.append((game_id, str(e)))
|
||||
print(message)
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f"Download failed for {title} ({game_id}): {e}")
|
||||
print(f"Critical error while downloading {label}: {e}")
|
||||
failed_game_ids.append((game_id, str(e)))
|
||||
traceback.print_exc()
|
||||
print(message)
|
||||
continue
|
||||
|
||||
print(f"\nAll done, downloaded files successfully for {len(jobs_successful)} title(s):")
|
||||
for title, download_path in jobs_successful:
|
||||
print(title)
|
||||
successful_titles = {}
|
||||
for game_id, file_name in jobs_successful:
|
||||
if game_id not in successful_titles:
|
||||
successful_titles[game_id] = [file_name]
|
||||
|
||||
print(f"\nDownloads failed for {len(jobs_failed)} title(s):")
|
||||
for status, title, url, message in jobs_failed:
|
||||
print(f"{title} - {url} - {status}: {message}")
|
||||
if any(successful_titles):
|
||||
print(f"\nAll done, downloaded files for {len(successful_titles)} title(s):")
|
||||
for game_id, files in successful_titles.items():
|
||||
print(f"{game_id_to_meta[game_id][0]}, {len(files)} file(s)")
|
||||
|
||||
if any(jobs_failed):
|
||||
print(f"\nDownloads failed for {len(jobs_failed)} file(s):")
|
||||
for game_id, file_name, message in jobs_failed:
|
||||
title, url = game_id_to_meta[game_id]
|
||||
print(f"{title} - {file_name} - {message}")
|
||||
print(f"Title URL: {url}")
|
||||
|
||||
if any(failed_game_ids):
|
||||
print(f"\nCompletely failed downloads for {len(failed_game_ids)} titles:")
|
||||
for game_id, message in failed_game_ids:
|
||||
title, url = game_id_to_meta[game_id]
|
||||
print(f"{title} ({game_id}) - {url} - {message}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Downloads games from public Itch.io game jams.")
|
||||
parser.add_argument("entries", help="path to the game jam entries.json file")
|
||||
parser.add_argument("--continue-from", metavar="ID", help="skip all entries until the provided entry ID is found")
|
||||
parser.add_argument("--api-key", metavar="key", required=True, help="itch.io API key from https://itch.io/user/settings/api-keys")
|
||||
parser.add_argument("--download-to", metavar="path", help="directory to save results into (default: current dir)")
|
||||
parser.add_argument("--continue-from", metavar="id", help="skip all entries until the provided entry ID is found")
|
||||
args = parser.parse_args()
|
||||
|
||||
continue_id = args.continue_from
|
||||
@ -238,4 +302,9 @@ if __name__ == "__main__":
|
||||
print("ID to continue from must be an integer.")
|
||||
exit(1)
|
||||
|
||||
download_jam(args.entries, continue_from=continue_id)
|
||||
download_to = os.getcwd()
|
||||
if args.download_to is not None:
|
||||
download_to = os.path.normpath(args.download_to)
|
||||
os.makedirs(download_to)
|
||||
|
||||
download_jam(args.entries, download_to, args.api_key, continue_from=continue_id)
|
||||
|
@ -1,3 +1,2 @@
|
||||
selenium
|
||||
requests
|
||||
python-slugify
|
||||
|
Loading…
x
Reference in New Issue
Block a user