From 5d0b8e1e999b7e2489ab1c3ec00dc66b972e47fd Mon Sep 17 00:00:00 2001 From: Ryszard Knop Date: Sat, 2 Oct 2021 03:05:29 +0200 Subject: [PATCH] Trial The First: Maybe It Works This Time --- README.md | 32 ++++++- downloader.py | 241 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 3 + 3 files changed, 274 insertions(+), 2 deletions(-) create mode 100644 downloader.py create mode 100644 requirements.txt diff --git a/README.md b/README.md index e83be82..4e633bb 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,30 @@ -# ItchJamDownloader -Download all games from a public Itch.io Game Jam +# Itch Jam Downloader + +Downloads all games from a public Itch.io Game Jam. + +What you'll need: + +- Python 3.8+ +- `pip install -r requirements.txt` +- [chromedriver](https://chromedriver.chromium.org/downloads) somewhere in your PATH + +On Arch, `pacman -S python chromium python-selenium python-requests python-slugify` works. + +How to use this: + +- Go to your jam's page, ex. https://itch.io/jam/gbcompo21 and right-click -> View Source. +- Ctrl-F for `"id":` - it should find that text once, followed by a number. Write it down. +- Download https://itch.io/jam/NUMBER/entries.json (replacing NUMBER with what you wrote down) +- Run the downloader: `python downloader.py entries.json` +- Wait. This is going to take a while. + +**This downloader does not (and probably will not) support HTML5-only games.** (For some of +these, you might get lucky by hitting F12 while the game loads and grabbing what's in there.) + +It's expected that the downloader output will not be complete - logs are stupidly verbose and +it prints a report on successful/failed downloads, so you must manually grab whatever was not +handled for you automatically for some reason. + +The downloader also grabs the entry page HTML, which usually comes with controls and such. 
It
+does not download images, external assets and so on, just the text - if the Itch page dies,
+so will most elements on those downloaded pages. Controls should survive, though.
diff --git a/downloader.py b/downloader.py
new file mode 100644
index 0000000..37f093b
--- /dev/null
+++ b/downloader.py
@@ -0,0 +1,241 @@
+#!/usr/bin/env python3
+# Python 3.9+ (str.removeprefix, builtin generics) and dependencies listed below required.
+import os
+import sys
+import json
+import time
+import hashlib
+import argparse
+import traceback
+from enum import Enum
+from multiprocessing import Pool
+
+import requests
+from slugify import slugify
+
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.common.exceptions import NoSuchElementException, TimeoutException
+
+
+class ItchDownloadResult(Enum):
+    SUCCESS = 0
+    FAILURE = 1
+    MISSING_DOWNLOAD = 2
+    DOWNLOAD_TIMEOUT = 3
+
+
+def parse_jobs(jam_json: dict) -> list[tuple[int, str, str]]:
+    if 'jam_games' not in jam_json:
+        raise Exception("Provided JSON is not a valid itch.io jam JSON.")
+
+    # Extract (id, url) pairs from all the entries.
+    return [(e['game']['id'], e['game']['title'], e['game']['url']) for e in jam_json['jam_games']]
+
+
+def try_extract_download_links(driver: webdriver.Chrome, title_url: str) -> list[str]:
+    elems = driver.find_elements(By.CLASS_NAME, "download_btn")
+    if len(elems) == 0:
+        raise NoSuchElementException("No download links found.")
+
+    cookie = driver.get_cookie("itchio_token")['value']
+    file_ids = [elem.get_attribute("data-upload_id") for elem in elems]
+    file_urls = []
+
+    for file_id in file_ids:
+        meta_url = f"{title_url}/file/{file_id}"
+        r = requests.post(meta_url, data={"csrf_token": cookie})
+        if r.ok:
+            file_urls.append(r.json()['url'])
+        else:
+            print(f"Error downloading metadata for file {file_id} (status {r.status_code}): {r.text}")
+
+    print(f"Extracted URLs: {file_urls}")
+    return file_urls
+
+
+def download_link(link: str, path: str) -> tuple[bool, str]:
+    r = requests.get(link)
+    if not r.ok:
+        return (False, r.reason)
+
+    # The bytes we need:
+    content = r.content
+
+    # Figure out the filename:
+    if 'Content-Disposition' in r.headers:
+        name = r.headers['Content-Disposition']
+        name = os.path.basename(name.removeprefix('attachment; filename="').removesuffix('"'))
+    else:  # uhhhh random bullshit go, good luck?
+        md5 = hashlib.md5()
+        md5.update(content)
+        name = md5.hexdigest()
+
+    # Make sure we don't overwrite files with the same name.
+    fcounter = 1
+    filename = f"{path}/{name}"
+    while os.path.exists(filename):
+        fcounter += 1
+        filename = f"{path}/{name}.{fcounter}"
+
+    try:
+        with open(filename, 'wb') as f:
+            f.write(content)
+    except Exception as e:
+        return (False, f"Cannot write output file: {e}")
+
+    return (True, "Success")
+
+
+def download_files(links, path) -> list[tuple[bool, str]]:
+    if len(links) == 0:
+        print(f"Nothing to download into {path}")
+        return []
+
+    with Pool(len(links)) as p:
+        results = p.starmap(download_link, [(link, path) for link in links])
+        return results
+
+
+def parse_download_results(results, method) -> tuple[ItchDownloadResult, str]:
+    global_success = True
+    for success, reason in results:
+        if not success:
+            print(f"Download failed: {reason}")
+            global_success = False
+
+    if global_success:
+        return (ItchDownloadResult.SUCCESS, f"Method #{method} successful.")
+    else:
+        return (ItchDownloadResult.FAILURE, f"Method #{method} partially successful (downloads failed).")
+
+
+def download_title(title_id: int, title_url: str, download_path: str) -> tuple[ItchDownloadResult, str]:
+    options = Options()
+    options.add_argument("--headless")
+
+    with webdriver.Chrome(options=options) as driver:
+        wait = WebDriverWait(driver, timeout=15)
+        driver.get(title_url)
+
+        with open(f"{download_path}/index.html", 'w') as f:
+            f.write(driver.page_source)
+
+        skip_purchase_locator = (By.CLASS_NAME, "direct_download_btn")
+
+        try:
+            print("Trying method #1: Purchase Workflow")
+            elem = driver.find_element(By.CLASS_NAME, "buy_btn")
+            elem.click()
+
+            elem = wait.until(EC.presence_of_element_located(skip_purchase_locator))
+            elem.click()
+
+            wait.until(EC.number_of_windows_to_be(2))
+            time.sleep(1)
+
+            first_tab = driver.current_window_handle
+            for window_handle in driver.window_handles:
+                if window_handle != first_tab:
+                    driver.switch_to.window(window_handle)
+                    break
+
+            # We're now on the main downloads page.
+            download_links = try_extract_download_links(driver, title_url)
+            results = download_files(download_links, download_path)
+            return parse_download_results(results, 1)
+        except TimeoutException:
+            print("Method #1 took too long - sleeping for 1m to avoid ~ mystery funsies ~")
+            time.sleep(60)
+
+            return ItchDownloadResult.DOWNLOAD_TIMEOUT, "Download timed out"
+        except NoSuchElementException:
+            print("Method #1 failed.")
+
+        try:
+            print("Trying method #2: Direct Download Workflow")
+            download_links = try_extract_download_links(driver, title_url)
+            results = download_files(download_links, download_path)
+            return parse_download_results(results, 2)
+        except NoSuchElementException:
+            print("Method #2 failed.")
+
+    print("File links missing/no method able to handle target URL.")
+    return ItchDownloadResult.MISSING_DOWNLOAD, "No download method worked."
+def download_jam(path_to_json: str, continue_from: int = None):
+    try:
+        with open(path_to_json) as f:
+            jam_json = json.load(f)
+    except FileNotFoundError:
+        sys.exit(f"File {path_to_json} not found.")
+    except json.decoder.JSONDecodeError:
+        sys.exit("Provided file is not a valid JSON file.")
+
+    jobs = parse_jobs(jam_json)
+    jobs_successful = []
+    jobs_failed = []
+
+    # No "continue from"? Yep, start right away.
+    should_process_jobs = continue_from is None
+
+    for job in jobs:
+        game_id, title, url = job
+        if not should_process_jobs:
+            if game_id == continue_from:
+                should_process_jobs = True
+            else:
+                continue
+
+        r = requests.get(f"{url}/data.json")
+        if r.status_code != 200:
+            print(f"Missing data for {url}, probably invalid")
+            jobs_failed.append((ItchDownloadResult.FAILURE, title, url, "Missing data.json"))
+            continue
+
+        download_path = os.path.join(os.getcwd(), slugify(title))
+        print(f"Trying to download {title} ({game_id}) to {download_path}")
+
+        if not os.path.isdir(download_path):
+            os.mkdir(download_path)
+
+        try:
+            status, message = download_title(game_id, url, download_path)
+            print(f"{title}: {status}, {message}")
+
+            if status == ItchDownloadResult.SUCCESS:
+                jobs_successful.append((title, download_path))
+            else:
+                jobs_failed.append((status, title, url, message))
+        except Exception as e:
+            print(f"Download failed for {title} ({game_id}): {e}")
+            traceback.print_exc()
+            continue
+
+    print(f"\nAll done, downloaded files successfully for {len(jobs_successful)} title(s):")
+    for title, download_path in jobs_successful:
+        print(title)
+
+    print(f"\nDownloads failed for {len(jobs_failed)} title(s):")
+    for status, title, url, message in jobs_failed:
+        print(f"{title} - {url} - {status}: {message}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Downloads games from public Itch.io game jams.")
+    parser.add_argument("entries", help="path to the game jam entries.json file")
+    parser.add_argument("--continue-from", metavar="ID", help="skip all entries until the provided entry ID is found")
+    args = parser.parse_args()
+
+    continue_id = args.continue_from
+    if continue_id is not None:
+        try:
+            continue_id = int(continue_id)
+        except ValueError:
+            print("ID to continue from must be an integer.")
+            exit(1)
+
+    download_jam(args.entries, continue_from=continue_id)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8a75348
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+selenium +requests +python-slugify