mirror of
https://github.com/DragoonAethis/itch-dl.git
synced 2024-12-21 02:21:52 +01:00
Trial The First: Maybe It Works This Time
This commit is contained in:
parent
bdd95f62a8
commit
5d0b8e1e99
32
README.md
32
README.md
@ -1,2 +1,30 @@
|
||||
# ItchJamDownloader
|
||||
Download all games from a public Itch.io Game Jam
|
||||
# Itch Jam Downloader
|
||||
|
||||
Downloads all games from a public Itch.io Game Jam.
|
||||
|
||||
What you'll need:
|
||||
|
||||
- Python 3.8+
|
||||
- `pip install -r requirements.txt`
|
||||
- [chromedriver](https://chromedriver.chromium.org/downloads) somewhere in your PATH
|
||||
|
||||
On Arch, `pacman -S python chromium python-selenium python-requests python-slugify` works.
|
||||
|
||||
How to use this:
|
||||
|
||||
- Go to your jam's page, ex. https://itch.io/jam/gbcompo21 and right-click -> View Source.
|
||||
- Ctrl-F for `"id":` - it should find that text once, followed by a number. Write it down.
|
||||
- Download https://itch.io/jam/NUMBER/entries.json (replacing NUMBER with what you wrote down)
|
||||
- Run the downloader: `python downloader.py entries.json`
|
||||
- Wait. This is going to take a while.
|
||||
|
||||
**This downloader does not (and probably will not) support HTML5-only games.** (For some of
|
||||
these, you might get lucky by hitting F12 while the game loads and grabbing what's in there.)
|
||||
|
||||
It's expected that the downloader output will not be complete - the logs are verbose, and
|
||||
a report on successful/failed downloads is printed at the end, so you can manually grab
|
||||
whatever was not handled for you automatically for some reason.
|
||||
|
||||
The downloader also grabs the entry page HTML, which usually comes with controls and such. It
|
||||
does not download images, external assets and so on, just the text - if the Itch page dies,
|
||||
so will most elements on those downloaded pages. Controls should survive, though.
|
||||
|
241
downloader.py
Normal file
241
downloader.py
Normal file
@ -0,0 +1,241 @@
|
||||
#!/usr/bin/env python3
|
||||
# Python 3.8+ and dependencies listed below required.
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
import hashlib
|
||||
import argparse
|
||||
import traceback
|
||||
from enum import Enum
|
||||
from multiprocessing import Pool
|
||||
|
||||
import requests
|
||||
from slugify import slugify
|
||||
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.common.exceptions import NoSuchElementException, TimeoutException
|
||||
|
||||
|
||||
class ItchDownloadResult(Enum):
    """Overall outcome of a single title's download attempt."""
    # All requested files were downloaded and written to disk.
    SUCCESS = 0
    # At least one file failed to download (reported as "partially successful").
    FAILURE = 1
    # No download method found any usable links for the title.
    MISSING_DOWNLOAD = 2
    # The Selenium-driven purchase workflow timed out.
    DOWNLOAD_TIMEOUT = 3
|
||||
|
||||
|
||||
def parse_jobs(jam_json: dict) -> list[tuple[int, str, str]]:
    """Extract download jobs from a parsed itch.io jam entries JSON.

    :param jam_json: the parsed contents of a jam's entries.json dump.
    :return: a list of (game_id, title, url) tuples, one per jam entry.
    :raises ValueError: if the JSON does not look like a jam dump.
        (ValueError is a subclass of Exception, so existing broad handlers
        still catch it.)
    """
    if 'jam_games' not in jam_json:
        raise ValueError("Provided JSON is not a valid itch.io jam JSON.")

    # Extract (id, title, url) triples from all the entries.
    return [(e['game']['id'], e['game']['title'], e['game']['url']) for e in jam_json['jam_games']]
|
||||
|
||||
|
||||
def try_extract_download_links(driver: webdriver.Chrome, title_url: str) -> list[str]:
    """Collect direct file URLs for every download button on the current page.

    Posts each button's upload ID to the title's /file/ endpoint (with the
    itchio_token cookie as the CSRF token) to resolve the actual file URL.

    :raises NoSuchElementException: when the page has no download buttons.
    """
    buttons = driver.find_elements(By.CLASS_NAME, "download_btn")
    if not buttons:
        raise NoSuchElementException("No download links found.")

    csrf_token = driver.get_cookie("itchio_token")['value']
    upload_ids = [button.get_attribute("data-upload_id") for button in buttons]

    extracted_urls = []
    for file_id in upload_ids:
        meta_url = f"{title_url}/file/{file_id}"
        response = requests.post(meta_url, data={"csrf_token": csrf_token})
        if not response.ok:
            print(f"Error downloading metadata for file {file_id} (status {response.status_code}): {response.text}")
            continue
        extracted_urls.append(response.json()['url'])

    print(f"Extracted URLs: {extracted_urls}")
    return extracted_urls
|
||||
|
||||
|
||||
def download_link(link: str, path: str) -> tuple[bool, str]:
    """Download a single file from `link` into the directory `path`.

    The filename is taken from the Content-Disposition header when present,
    falling back to the MD5 hex digest of the content otherwise. Name
    collisions get a numeric suffix appended.

    :return: (True, "Success") on success, (False, reason) on failure.
    """
    r = requests.get(link)
    if not r.ok:
        return (False, r.reason)

    # The bytes we need:
    content = r.content

    # Figure out the filename:
    if 'Content-Disposition' in r.headers:
        name = r.headers['Content-Disposition']
        name = name.removeprefix('attachment; filename="').removesuffix('"')
        # The header value is server-controlled: strip any path components
        # so a crafted filename (e.g. "../../x") cannot escape `path`.
        name = os.path.basename(name)
        if not name:  # header present but degenerate - fall back to a hash
            name = hashlib.md5(content).hexdigest()
    else:  # uhhhh random bullshit go, good luck?
        md5 = hashlib.md5()
        md5.update(content)
        name = md5.hexdigest()

    # Make sure we don't overwrite files with the same name.
    fcounter = 1
    filename = f"{path}/{name}"
    while os.path.exists(filename):
        fcounter += 1
        filename = f"{path}/{name}.{fcounter}"

    try:
        with open(filename, 'wb') as f:
            f.write(content)
    except Exception as e:
        return (False, f"Cannot write output file: {e}")

    return (True, "Success")
|
||||
|
||||
|
||||
def download_files(links, path) -> list[tuple[bool, str]]:
    """Download all `links` into `path` in parallel, one worker per link.

    :return: a (success, reason) tuple per link, in the same order.
    """
    if not links:
        print(f"Nothing to download into {path}")
        return []

    jobs = [(link, path) for link in links]
    with Pool(len(links)) as pool:
        return pool.starmap(download_link, jobs)
|
||||
|
||||
|
||||
def parse_download_results(results, method) -> tuple[ItchDownloadResult, str]:
    """Fold per-file (success, reason) results into one overall verdict.

    Prints every failure reason as a side effect.
    """
    failure_reasons = [reason for ok, reason in results if not ok]
    for reason in failure_reasons:
        print(f"Download failed: {reason}")

    if failure_reasons:
        return (ItchDownloadResult.FAILURE, f"Method #{method} partially successful (downloads failed).")
    return (ItchDownloadResult.SUCCESS, f"Method #{method} successful.")
|
||||
|
||||
|
||||
def download_title(title_id: int, title_url: str, download_path: str) -> tuple[ItchDownloadResult, str]:
    """Download all files for a single itch.io title into download_path.

    Drives a headless Chrome through two strategies:
      #1 "Purchase Workflow": click the buy button, click the skip-purchase
         (direct_download_btn) element, and follow the second browser tab
         that opens to the downloads page.
      #2 "Direct Download Workflow": scrape download buttons straight off
         the title page.

    Also saves the title page HTML as index.html in download_path.
    Returns a (result, human-readable message) tuple. Note: title_id is
    not used in this body - kept for caller symmetry.
    """
    options = Options()
    options.add_argument("--headless")

    with webdriver.Chrome(options=options) as driver:
        wait = WebDriverWait(driver, timeout=15)
        driver.get(title_url)

        # Keep a copy of the page itself - it usually embeds the controls.
        with open(f"{download_path}/index.html", 'w') as f:
            f.write(driver.page_source)

        skip_purchase_locator = (By.CLASS_NAME, "direct_download_btn")

        try:
            print("Trying method #1: Purchase Workflow")
            elem = driver.find_element(By.CLASS_NAME, "buy_btn")
            elem.click()

            # Wait for the skip-purchase button to appear, then click it.
            elem = wait.until(EC.presence_of_element_located(skip_purchase_locator))
            elem.click()

            # The downloads page opens in a second tab; the extra sleep
            # presumably lets it finish loading - TODO confirm it's needed.
            wait.until(EC.number_of_windows_to_be(2))
            time.sleep(1)

            # Switch to whichever tab is not the original one.
            first_tab = driver.current_window_handle
            for window_handle in driver.window_handles:
                if window_handle != first_tab:
                    driver.switch_to.window(window_handle)
                    break

            # We're now on the main downloads page.
            download_links = try_extract_download_links(driver, title_url)
            results = download_files(download_links, download_path)
            return parse_download_results(results, 1)
        except TimeoutException:
            # NOTE(review): a timeout aborts the whole title instead of
            # falling through to method #2, unlike NoSuchElementException.
            print("Method #1 took too long - sleeping for 1m to avoid ~ mystery funsies ~")
            time.sleep(60)

            return ItchDownloadResult.DOWNLOAD_TIMEOUT, "Download timed out"
        except NoSuchElementException:
            # No buy button (or no links) - fall through to method #2.
            print("Method #1 failed.")

        try:
            print("Trying method #2: Direct Download Workflow")
            download_links = try_extract_download_links(driver, title_url)
            results = download_files(download_links, download_path)
            return parse_download_results(results, 2)
        except NoSuchElementException:
            print("Method #2 failed.")

        print("File links missing/no method able to handle target URL.")
        return ItchDownloadResult.MISSING_DOWNLOAD, "No download method worked."
|
||||
|
||||
def download_jam(path_to_json: str, continue_from=None):
    """Download every title listed in a jam entries JSON file.

    :param path_to_json: path to the jam's entries.json dump.
    :param continue_from: optional game ID (int, as produced by the CLI) -
        when given, all entries before the one with this ID are skipped,
        which allows resuming an interrupted run.

    Prints a success/failure report when done. Returns None.
    """
    try:
        with open(path_to_json) as f:
            jam_json = json.load(f)
    except FileNotFoundError:
        print(f"File {path_to_json} not found.")
        return  # was: fell through and crashed on the unbound jam_json
    except json.decoder.JSONDecodeError:
        print("Provided file is not a valid JSON file.")
        return  # same fall-through bug as above

    jobs = parse_jobs(jam_json)
    jobs_successful = []
    jobs_failed = []

    # No "continue from"? Yep, start right away.
    should_process_jobs = continue_from is None

    for game_id, title, url in jobs:
        if not should_process_jobs:
            if game_id == continue_from:
                should_process_jobs = True
            else:
                continue

        r = requests.get(f"{url}/data.json")
        if r.status_code != 200:
            print(f"Missing data for {url}, probably invalid")
            # was: `failed_jobs += url` - a NameError, and the wrong shape
            # for the report loop below, which unpacks 4-tuples.
            jobs_failed.append((ItchDownloadResult.FAILURE, title, url, "Missing data.json"))
            continue

        download_path = os.path.join(os.getcwd(), slugify(title))
        print(f"Trying to download {title} ({game_id}) to {download_path}")

        if not os.path.isdir(download_path):
            os.mkdir(download_path)

        try:
            status, message = download_title(game_id, url, download_path)
            print(f"{title}: {status}, {message}")

            if status == ItchDownloadResult.SUCCESS:
                jobs_successful.append((title, download_path))
            else:
                jobs_failed.append((status, title, url, message))
        except Exception as e:
            # Keep going - one broken title must not kill the whole jam run.
            print(f"Download failed for {title} ({game_id}): {e}")
            traceback.print_exc()
            continue

    print(f"\nAll done, downloaded files successfully for {len(jobs_successful)} title(s):")
    for title, download_path in jobs_successful:
        print(title)

    print(f"\nDownloads failed for {len(jobs_failed)} title(s):")
    for status, title, url, message in jobs_failed:
        print(f"{title} - {url} - {status}: {message}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Downloads games from public Itch.io game jams.")
    parser.add_argument("entries", help="path to the game jam entries.json file")
    parser.add_argument("--continue-from", metavar="ID", help="skip all entries until the provided entry ID is found")
    args = parser.parse_args()

    # Entry IDs in the jam JSON are integers - normalize the CLI string.
    continue_id = args.continue_from
    if continue_id is not None:
        try:
            continue_id = int(continue_id)
        except ValueError:  # was a bare except: - would swallow KeyboardInterrupt etc.
            print("ID to continue from must be an integer.")
            sys.exit(1)  # sys is already imported; exit() is interactive-only

    download_jam(args.entries, continue_from=continue_id)
|
3
requirements.txt
Normal file
3
requirements.txt
Normal file
@ -0,0 +1,3 @@
|
||||
selenium
|
||||
requests
|
||||
python-slugify
|
Loading…
Reference in New Issue
Block a user