Trial The First: Maybe It Works This Time

This commit is contained in:
Ryszard Knop 2021-10-02 03:05:29 +02:00
parent bdd95f62a8
commit 5d0b8e1e99
3 changed files with 274 additions and 2 deletions

View File

@ -1,2 +1,30 @@
# ItchJamDownloader
Download all games from a public Itch.io Game Jam
# Itch Jam Downloader
Downloads all games from a public Itch.io Game Jam.
What you'll need:
- Python 3.8+
- `pip install -r requirements.txt`
- [chromedriver](https://chromedriver.chromium.org/downloads) somewhere in your PATH
On Arch, `pacman -S python chromium python-selenium python-requests python-slugify` works.
How to use this:
- Go to your jam's page, ex. https://itch.io/jam/gbcompo21 and right-click -> View Source.
- Ctrl-F for `"id":` - it should find that text once, followed by a number. Write it down.
- Download https://itch.io/jam/NUMBER/entries.json (replacing NUMBER with what you wrote down)
- Run the downloader: `python downloader.py entries.json`
- Wait. This is going to take a while.
**This downloader does not (and probably will not) support HTML5-only games.** (For some of
these, you might get lucky by hitting F12 while the game loads and grabbing what's in there.)
Expect the downloader output to be incomplete - the logs are very verbose, and a report on
successful/failed downloads is printed at the end, so you can manually grab whatever was not
handled automatically.
The downloader also grabs the entry page HTML, which usually comes with controls and such. It
does not download images, external assets and so on, just the text - if the Itch page dies,
so will most elements on those downloaded pages. Controls should survive, though.

241
downloader.py Normal file
View File

@ -0,0 +1,241 @@
#!/usr/bin/env python3
# Python 3.8+ and dependencies listed below required.
import os
import sys
import json
import time
import hashlib
import argparse
import traceback
from enum import Enum
from multiprocessing import Pool
import requests
from slugify import slugify
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
class ItchDownloadResult(Enum):
    """Outcome of a single title download attempt.

    Values are stable integers so they can be logged/compared reliably.
    """
    SUCCESS = 0           # every file downloaded
    FAILURE = 1           # at least one file failed
    MISSING_DOWNLOAD = 2  # no download method could handle the page
    DOWNLOAD_TIMEOUT = 3  # the purchase workflow timed out
def parse_jobs(jam_json: dict) -> list[tuple[int, str, str]]:
    """Turn a jam entries JSON into (game id, title, url) triples.

    Raises a generic Exception when the JSON lacks the 'jam_games' key,
    i.e. it is not an itch.io jam entries file.
    """
    if 'jam_games' not in jam_json:
        raise Exception("Provided JSON is not a valid itch.io jam JSON.")

    jobs = []
    for entry in jam_json['jam_games']:
        game = entry['game']
        jobs.append((game['id'], game['title'], game['url']))
    return jobs
def try_extract_download_links(driver: webdriver.Chrome, title_url: str) -> list[str]:
    """Scrape the current page for download buttons and resolve file URLs.

    :param driver: Chrome driver already navigated to the title's downloads page
    :param title_url: base URL of the title, used to build per-file metadata URLs
    :return: list of direct file URLs (one per resolvable download button)
    :raises NoSuchElementException: if the page has no download buttons at all
    """
    elems = driver.find_elements(By.CLASS_NAME, "download_btn")
    if len(elems) == 0:
        raise NoSuchElementException("No download links found.")

    # The metadata endpoint wants the session's CSRF token.
    # NOTE(review): assumes the itchio_token cookie is always present once the
    # downloads page is reached - get_cookie returns None otherwise; confirm.
    cookie = driver.get_cookie("itchio_token")['value']

    file_ids = [elem.get_attribute("data-upload_id") for elem in elems]
    file_urls = []
    for file_id in file_ids:
        meta_url = f"{title_url}/file/{file_id}"
        # Timeout added so one stuck POST cannot hang the entire run.
        r = requests.post(meta_url, data={"csrf_token": cookie}, timeout=30)
        if r.ok:
            file_urls.append(r.json()['url'])
        else:
            print(f"Error downloading metadata for file {file_id} (status {r.status_code}): {r.text}")

    print(f"Extracted URLs: {file_urls}")
    return file_urls
def download_link(link: str, path: str) -> tuple[bool, str]:
    """Download one file URL into directory `path`.

    :param link: direct file URL
    :param path: existing directory to place the file in
    :return: (success, reason) - reason is "Success" or an error description
    """
    try:
        # Timeout added: an unresponsive server previously hung the worker forever.
        r = requests.get(link, timeout=300)
    except requests.RequestException as e:
        return (False, f"Request failed: {e}")
    if not r.ok:
        return (False, r.reason)

    # The bytes we need:
    content = r.content

    # Figure out the filename: prefer the server-provided one, fall back
    # to a content hash when the header is missing or empty.
    name = ""
    if 'Content-Disposition' in r.headers:
        name = r.headers['Content-Disposition']
        name = name.removeprefix('attachment; filename="').removesuffix('"')
        # Strip directory components so a hostile/odd header value
        # (e.g. '../../x') cannot escape the target directory.
        name = os.path.basename(name)
    if not name:
        md5 = hashlib.md5()
        md5.update(content)
        name = md5.hexdigest()

    # Make sure we don't overwrite files with the same name.
    fcounter = 1
    filename = os.path.join(path, name)
    while os.path.exists(filename):
        fcounter += 1
        filename = os.path.join(path, f"{name}.{fcounter}")

    try:
        with open(filename, 'wb') as f:
            f.write(content)
    except OSError as e:
        return (False, f"Cannot write output file: {e}")
    return (True, "Success")
def download_files(links, path) -> list[tuple[bool, str]]:
    """Download every URL in `links` into directory `path` in parallel.

    :param links: iterable of direct file URLs
    :param path: existing target directory
    :return: one (success, reason) tuple per link, in input order
    """
    if len(links) == 0:
        print(f"Nothing to download into {path}")
        return []

    # Cap the pool size: the original spawned one process per link, which
    # does not scale for titles with many files.
    with Pool(min(len(links), 8)) as p:
        results = p.starmap(download_link, [(link, path) for link in links])
    return results
def parse_download_results(results, method) -> tuple[ItchDownloadResult, str]:
    """Collapse per-file (success, reason) results into one status pair.

    Prints each failure reason; returns SUCCESS only when every file
    downloaded, FAILURE otherwise.
    """
    failures = [reason for ok, reason in results if not ok]
    for reason in failures:
        print(f"Download failed: {reason}")

    if failures:
        return (ItchDownloadResult.FAILURE, f"Method #{method} partially successful (downloads failed).")
    return (ItchDownloadResult.SUCCESS, f"Method #{method} successful.")
def download_title(title_id: int, title_url: str, download_path: str) -> tuple[ItchDownloadResult, str]:
    """Download all files for one itch.io title into `download_path`.

    Tries two strategies on the title page:
      #1 "Purchase Workflow": click the buy button, skip the purchase,
         follow the downloads page that opens in a new tab.
      #2 "Direct Download Workflow": scrape download buttons directly.

    :param title_id: game ID (currently unused; kept for interface stability)
    :param title_url: the title's itch.io page URL
    :param download_path: existing directory for the files and index.html
    :return: (ItchDownloadResult, human-readable message)
    """
    options = Options()
    options.add_argument("--headless")

    with webdriver.Chrome(options=options) as driver:
        wait = WebDriverWait(driver, timeout=15)
        driver.get(title_url)

        # Save the entry page HTML. Explicit encoding so non-ASCII page
        # source cannot crash on platforms with a non-UTF-8 default locale.
        with open(f"{download_path}/index.html", 'w', encoding='utf-8') as f:
            f.write(driver.page_source)

        skip_purchase_locator = (By.CLASS_NAME, "direct_download_btn")
        try:
            print("Trying method #1: Purchase Workflow")
            elem = driver.find_element(By.CLASS_NAME, "buy_btn")
            elem.click()

            elem = wait.until(EC.presence_of_element_located(skip_purchase_locator))
            elem.click()

            # Skipping the purchase opens the downloads page in a second tab.
            wait.until(EC.number_of_windows_to_be(2))
            time.sleep(1)  # give the new tab a moment to finish loading

            first_tab = driver.current_window_handle
            for window_handle in driver.window_handles:
                if window_handle != first_tab:
                    driver.switch_to.window(window_handle)
                    break

            # We're now on the main downloads page.
            download_links = try_extract_download_links(driver, title_url)
            results = download_files(download_links, download_path)
            return parse_download_results(results, 1)
        except TimeoutException:
            print("Method #1 took too long - sleeping for 1m to avoid ~ mystery funsies ~")
            time.sleep(60)
            return ItchDownloadResult.DOWNLOAD_TIMEOUT, "Download timed out"
        except NoSuchElementException:
            print("Method #1 failed.")

        try:
            print("Trying method #2: Direct Download Workflow")
            download_links = try_extract_download_links(driver, title_url)
            results = download_files(download_links, download_path)
            return parse_download_results(results, 2)
        except NoSuchElementException:
            print("Method #2 failed.")

    print("File links missing/no method able to handle target URL.")
    return ItchDownloadResult.MISSING_DOWNLOAD, "No download method worked."
def download_jam(path_to_json: str, continue_from: str = None):
    """Download every entry of a jam described by an entries.json file.

    :param path_to_json: path to the jam's entries.json file
    :param continue_from: optional game ID; all entries before it are skipped
    """
    try:
        with open(path_to_json) as f:
            jam_json = json.load(f)
    except FileNotFoundError:
        print(f"File {path_to_json} not found.")
        return  # bug fix: previously fell through and crashed on unbound jam_json
    except json.decoder.JSONDecodeError:
        print("Provided file is not a valid JSON file.")
        return  # bug fix: same fall-through crash as above

    jobs = parse_jobs(jam_json)
    jobs_successful = []
    jobs_failed = []

    # No "continue from"? Yep, start right away.
    should_process_jobs = continue_from is None
    for game_id, title, url in jobs:
        if not should_process_jobs:
            if game_id == continue_from:
                should_process_jobs = True
            else:
                continue

        r = requests.get(f"{url}/data.json", timeout=30)
        if r.status_code != 200:
            print(f"Missing data for {url}, probably invalid")
            # Bug fix: original did `failed_jobs += url` - a NameError on an
            # undefined variable. Record the failure like the other branches.
            jobs_failed.append((ItchDownloadResult.MISSING_DOWNLOAD, title, url, "Missing data.json"))
            continue

        download_path = os.path.join(os.getcwd(), slugify(title))
        print(f"Trying to download {title} ({game_id}) to {download_path}")
        if not os.path.isdir(download_path):
            os.mkdir(download_path)

        try:
            status, message = download_title(game_id, url, download_path)
            print(f"{title}: {status}, {message}")
            if status == ItchDownloadResult.SUCCESS:
                jobs_successful.append((title, download_path))
            else:
                jobs_failed.append((status, title, url, message))
        except Exception as e:
            # Keep going: one broken title should not abort the whole jam.
            print(f"Download failed for {title} ({game_id}): {e}")
            traceback.print_exc()
            continue

    print(f"\nAll done, downloaded files successfully for {len(jobs_successful)} title(s):")
    for title, download_path in jobs_successful:
        print(title)

    print(f"\nDownloads failed for {len(jobs_failed)} title(s):")
    for status, title, url, message in jobs_failed:
        print(f"{title} - {url} - {status}: {message}")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Downloads games from public Itch.io game jams.")
    parser.add_argument("entries", help="path to the game jam entries.json file")
    parser.add_argument("--continue-from", metavar="ID", help="skip all entries until the provided entry ID is found")
    args = parser.parse_args()

    continue_id = args.continue_from
    if continue_id is not None:
        try:
            continue_id = int(continue_id)
        except ValueError:
            # Bug fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.
            print("ID to continue from must be an integer.")
            exit(1)

    download_jam(args.entries, continue_from=continue_id)

3
requirements.txt Normal file
View File

@ -0,0 +1,3 @@
selenium
requests
python-slugify