itch-dl/downloader.py

#!/usr/bin/env python3
# Python 3.9+ and the dependencies listed below are required
# (str.removeprefix and built-in generics like list[...] need 3.9).
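#
# A plausible install line for the third-party imports below (package names are a
# best guess from the import names; "slugify" typically comes from python-slugify):
#
#   pip install requests python-slugify selenium
#
# webdriver.Chrome also needs Chrome/Chromium installed; older Selenium releases
# additionally expect a matching chromedriver binary on PATH.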
import os
import sys
import json
import time
import hashlib
import argparse
import traceback
from enum import Enum
from multiprocessing import Pool

import requests
from slugify import slugify
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException


class ItchDownloadResult(Enum):
    SUCCESS = 0
    FAILURE = 1
    MISSING_DOWNLOAD = 2
    DOWNLOAD_TIMEOUT = 3


def parse_jobs(jam_json: dict) -> list[tuple[int, str, str]]:
    if 'jam_games' not in jam_json:
        raise ValueError("Provided JSON is not a valid itch.io jam JSON.")

    # Extract (id, title, url) triples from all the entries.
    return [(e['game']['id'], e['game']['title'], e['game']['url']) for e in jam_json['jam_games']]
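
# For reference, the entries JSON is expected to look roughly like this - the shape
# is inferred from the accesses in parse_jobs() above, not from a documented schema:
#
#   {"jam_games": [{"game": {"id": 123456, "title": "Some Game", "url": "https://someone.itch.io/some-game"}}, ...]}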


def try_extract_download_links(driver: webdriver.Chrome, title_url: str) -> list[str]:
    elems = driver.find_elements(By.CLASS_NAME, "download_btn")
    if len(elems) == 0:
        raise NoSuchElementException("No download links found.")

    # The itchio_token cookie doubles as the CSRF token for the metadata POSTs below.
    cookie = driver.get_cookie("itchio_token")['value']
    file_ids = [elem.get_attribute("data-upload_id") for elem in elems]

    file_urls = []
    for file_id in file_ids:
        meta_url = f"{title_url}/file/{file_id}"
        r = requests.post(meta_url, data={"csrf_token": cookie})
        if r.ok:
            file_urls.append(r.json()['url'])
        else:
            print(f"Error downloading metadata for file {file_id} (status {r.status_code}): {r.text}")

    print(f"Extracted URLs: {file_urls}")
    return file_urls
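
# Note: the URLs handed back by the /file/<id> endpoint appear to be signed,
# short-lived CDN links, so download them soon after extraction. (An assumption
# based on observed behavior, not a documented guarantee.)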


def download_link(link: str, path: str) -> tuple[bool, str]:
    r = requests.get(link)
    if not r.ok:
        return (False, r.reason)

    # The bytes we need:
    content = r.content

    # Figure out the filename:
    if 'Content-Disposition' in r.headers:
        # Naive header parsing, but it matches the format itch.io appears to serve.
        name = r.headers['Content-Disposition']
        name = name.removeprefix('attachment; filename="').removesuffix('"')
    else:
        # No filename hint at all - fall back to the MD5 of the content.
        md5 = hashlib.md5()
        md5.update(content)
        name = md5.hexdigest()

    # Make sure we don't overwrite files with the same name.
    fcounter = 1
    filename = f"{path}/{name}"
    while os.path.exists(filename):
        fcounter += 1
        filename = f"{path}/{name}.{fcounter}"

    try:
        with open(filename, 'wb') as f:
            f.write(content)
    except Exception as e:
        return (False, f"Cannot write output file: {e}")

    return (True, "Success")


def download_files(links: list[str], path: str) -> list[tuple[bool, str]]:
    if len(links) == 0:
        print(f"Nothing to download into {path}")
        return []

    with Pool(len(links)) as p:
        results = p.starmap(download_link, [(link, path) for link in links])

    return results
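
# Design note: Pool(len(links)) spawns one worker process per file. That keeps the
# code trivial and is fine for the handful of uploads a jam entry usually has; a
# capped pool size would be the safer choice for titles with many files.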


def parse_download_results(results: list[tuple[bool, str]], method: int) -> tuple[ItchDownloadResult, str]:
    global_success = True
    for success, reason in results:
        if not success:
            print(f"Download failed: {reason}")
            global_success = False

    if global_success:
        return (ItchDownloadResult.SUCCESS, f"Method #{method} successful.")
    else:
        return (ItchDownloadResult.FAILURE, f"Method #{method} partially successful (some downloads failed).")
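
# A rough map of the two strategies download_title() attempts below - the class
# names come from observed itch.io page markup, not from any stable API:
#
#   Method #1 ("Purchase Workflow"): click the "buy_btn" element, dismiss the
#     pay-what-you-want prompt via "direct_download_btn", then scrape the
#     downloads page that opens in a second tab.
#   Method #2 ("Direct Download Workflow"): some title pages expose "download_btn"
#     elements directly, so scrape the current page without any clicking.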


def download_title(title_id: int, title_url: str, download_path: str) -> tuple[ItchDownloadResult, str]:
    options = Options()
    options.add_argument("--headless")

    with webdriver.Chrome(options=options) as driver:
        wait = WebDriverWait(driver, timeout=15)
        driver.get(title_url)

        # Keep a copy of the page source for reference.
        with open(f"{download_path}/index.html", 'w') as f:
            f.write(driver.page_source)

        skip_purchase_locator = (By.CLASS_NAME, "direct_download_btn")

        try:
            print("Trying method #1: Purchase Workflow")
            elem = driver.find_element(By.CLASS_NAME, "buy_btn")
            elem.click()

            elem = wait.until(EC.presence_of_element_located(skip_purchase_locator))
            elem.click()

            wait.until(EC.number_of_windows_to_be(2))
            time.sleep(1)

            # Switch to the downloads page that opened in the second tab.
            first_tab = driver.current_window_handle
            for window_handle in driver.window_handles:
                if window_handle != first_tab:
                    driver.switch_to.window(window_handle)
                    break

            # We're now on the main downloads page.
            download_links = try_extract_download_links(driver, title_url)
            results = download_files(download_links, download_path)
            return parse_download_results(results, 1)
        except TimeoutException:
            print("Method #1 took too long - sleeping for 1m to avoid ~ mystery funsies ~")
            time.sleep(60)
            return ItchDownloadResult.DOWNLOAD_TIMEOUT, "Download timed out"
        except NoSuchElementException:
            print("Method #1 failed.")

        try:
            print("Trying method #2: Direct Download Workflow")
            download_links = try_extract_download_links(driver, title_url)
            results = download_files(download_links, download_path)
            return parse_download_results(results, 2)
        except NoSuchElementException:
            print("Method #2 failed.")

        print("File links missing/no method able to handle target URL.")
        return ItchDownloadResult.MISSING_DOWNLOAD, "No download method worked."


def download_jam(path_to_json: str, continue_from: int = None):
    try:
        with open(path_to_json) as f:
            jam_json = json.load(f)
    except FileNotFoundError:
        print(f"File {path_to_json} not found.")
        return
    except json.decoder.JSONDecodeError:
        print("Provided file is not a valid JSON file.")
        return

    jobs = parse_jobs(jam_json)
    jobs_successful = []
    jobs_failed = []

    # No "continue from"? Yep, start right away.
    should_process_jobs = continue_from is None

    for game_id, title, url in jobs:
        if not should_process_jobs:
            if game_id == continue_from:
                should_process_jobs = True
            else:
                continue

        r = requests.get(f"{url}/data.json")
        if r.status_code != 200:
            print(f"Missing data for {url}, probably invalid")
            jobs_failed.append((ItchDownloadResult.MISSING_DOWNLOAD, title, url, "Missing data.json"))
            continue

        download_path = os.path.join(os.getcwd(), slugify(title))
        print(f"Trying to download {title} ({game_id}) to {download_path}")
        if not os.path.isdir(download_path):
            os.mkdir(download_path)

        try:
            status, message = download_title(game_id, url, download_path)
            print(f"{title}: {status}, {message}")
            if status == ItchDownloadResult.SUCCESS:
                jobs_successful.append((title, download_path))
            else:
                jobs_failed.append((status, title, url, message))
        except Exception as e:
            print(f"Download failed for {title} ({game_id}): {e}")
            traceback.print_exc()
            continue

    print(f"\nAll done, downloaded files successfully for {len(jobs_successful)} title(s):")
    for title, download_path in jobs_successful:
        print(title)

    print(f"\nDownloads failed for {len(jobs_failed)} title(s):")
    for status, title, url, message in jobs_failed:
        print(f"{title} - {url} - {status}: {message}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Downloads games from public Itch.io game jams.")
    parser.add_argument("entries", help="path to the game jam entries.json file")
    parser.add_argument("--continue-from", metavar="ID", help="skip all entries until the provided entry ID is found")
    args = parser.parse_args()

    continue_id = args.continue_from
    if continue_id is not None:
        try:
            continue_id = int(continue_id)
        except ValueError:
            print("ID to continue from must be an integer.")
            sys.exit(1)

    download_jam(args.entries, continue_from=continue_id)
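
# Example invocations (assuming entries.json was saved from the jam's entries
# listing beforehand - producing that file is outside the scope of this script):
#
#   python3 downloader.py entries.json
#   python3 downloader.py entries.json --continue-from 123456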