mirror of
https://github.com/DragoonAethis/itch-dl.git
synced 2024-12-20 18:11:52 +01:00
Trial The Third: Start rewriting the thing
Wooo, someone wants to use this! Let's make it less embarrassing.
This commit is contained in:
parent
00cced1f41
commit
4a8f88b48e
2
.idea/itch-dl.iml
generated
2
.idea/itch-dl.iml
generated
@ -4,7 +4,7 @@
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<sourceFolder url="file://$MODULE_DIR$" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="jdk" jdkName="Poetry (itch-dl)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
2
LICENSE
2
LICENSE
@ -1,6 +1,6 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2021 Dragoon Aethis
|
||||
Copyright (c) 2022 Dragoon Aethis
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
45
README.md
45
README.md
@ -1,43 +1,42 @@
|
||||
# itch-dl
|
||||
|
||||
Bulk download games from [itch.io](https://itch.io/). Currently only supports downloading game jams.
|
||||
Bulk download games from [itch.io](https://itch.io/).
|
||||
|
||||
What you'll need:
|
||||
|
||||
- Python 3.8+
|
||||
- `pip install -r requirements.txt`
|
||||
- For site mirroring, [wget](https://www.gnu.org/software/wget/) in your PATH.
|
||||
|
||||
On Arch, `pacman -S wget python python-requests python-slugify` works.
|
||||
- Can download game jams, browse pages (popular, newest, browse by tag...) and individual games.
|
||||
- Requires Python 3.8+, grab it from PyPI: `pip install itch-dl`
|
||||
- For development, use [Poetry](https://python-poetry.org/).
|
||||
- Optionally requires wget for site mirroring.
|
||||
|
||||
How to use this:
|
||||
|
||||
- Log into itch.io with the account you'd like to use for downloading.
|
||||
- Generate a new API key on your user account page: https://itch.io/user/settings/api-keys
|
||||
- Run the downloader: `python downloader.py --api-key <KEY> https://itch.io/jam/yourjamhere`
|
||||
- Run the downloader: `itch-dl --api-key <KEY> https://itch.io/jam/yourjamhere`
|
||||
- Wait. This is going to take a while.
|
||||
|
||||
The downloader is able to grab more or less everything you can download via the itch app.
|
||||
|
||||
It's expected that the downloader output will not be complete - logs are stupidly verbose and
|
||||
it prints a report on successful/failed downloads, so you must manually grab whatever was not
|
||||
handled for you automatically for some reason.
|
||||
The input URL can be any "Browse" page (top, popular, newest, filtered by tags, etc) or any
|
||||
game jam. The input can also be a path to an itch.io JSON file with game jam entries, or just
|
||||
a list of itch.io game URLs (not browse/jam pages!) to download.
|
||||
|
||||
The downloader also grabs the entry page HTML, which usually comes with controls and such. It
|
||||
does not download images, external assets and so on, just the text - if the Itch page dies,
|
||||
so will most elements on those downloaded pages. Controls should survive, though.
|
||||
**It's expected that the downloader output will not be complete** - logs are stupidly verbose
|
||||
and it prints a report on successful/failed downloads, so you must manually grab whatever was
|
||||
not handled for you automatically for some reason.
|
||||
|
||||
(There's a pedantic site mirroring toggle in the script, if you know what you're doing. You will
|
||||
need wget for that.)
|
||||
The downloader also grabs the entry page HTML, which usually comes with controls and such. By
|
||||
default, it does not download images, assets and so on, just the text - use `--mirror-web` to
|
||||
try and download these as well. This requires `wget` to be available in your `PATH`.
|
||||
|
||||
|
||||
## Cannot extract IDs?
|
||||
## Game Jam Entries JSON
|
||||
|
||||
Downloader can parse and download games from a game jam entries JSON file if you want to provide it.
|
||||
(The script basically automates the steps below, so if it's not able to do the same, please create
|
||||
an issue!)
|
||||
Downloader can parse and download games from a game jam entries JSON file if you need it.
|
||||
(The script basically automates the steps below, so if it's not able to do the same, please
|
||||
create an issue!)
|
||||
|
||||
- Go to your jam's page, ex. https://itch.io/jam/gbcompo21 and right-click -> View Source.
|
||||
- Ctrl-F for `"id":` - it should find that text once, followed by a number. Write it down.
|
||||
- (If you found it multiple times, grab the one after ViewJam something something.)
|
||||
- Download https://itch.io/jam/NUMBER/entries.json (replacing NUMBER with what you wrote down)
|
||||
- (If you found it multiple times, grab the one after I.ViewJam something something.)
|
||||
- Download https://itch.io/jam/ID/entries.json (replacing ID with what you wrote down).
|
||||
- Feed that to `itch-dl`!
|
||||
|
349
downloader.py
349
downloader.py
@ -1,349 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# Python 3.8+ and dependencies listed below required.
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
import shutil
|
||||
import hashlib
|
||||
import argparse
|
||||
import traceback
|
||||
import subprocess
|
||||
from enum import Enum
|
||||
|
||||
import requests
|
||||
from requests.adapters import HTTPAdapter
|
||||
from requests.packages.urllib3.util.retry import Retry
|
||||
|
||||
from slugify import slugify
|
||||
|
||||
WGET_PATH = shutil.which("wget")
|
||||
if WGET_PATH is None:
|
||||
print(f"Warning: wget not available, site mirroring will not work!")
|
||||
|
||||
# Try to download all site assets, images etc included.
|
||||
# You probably don't want this, but here you go!
|
||||
PEDANTIC_MIRRORING = False
|
||||
|
||||
ITCH_API = "https://api.itch.io"
|
||||
|
||||
|
||||
class ItchDownloadResult(Enum):
    """Coarse outcome of a single download attempt."""
    SUCCESS = 0
    FAILURE = 1
    MISSING_DOWNLOAD = 2
    DOWNLOAD_TIMEOUT = 3
|
||||
|
||||
|
||||
class ItchDownloadError(Exception):
    """Raised for unrecoverable download errors."""
    pass
|
||||
|
||||
|
||||
class ItchApiClient():
    """requests.Session wrapper for the itch.io API with retries and API key injection."""

    def __init__(self, base_url: str, api_key: str):
        self.base_url = base_url
        self.api_key = api_key

        self.requests = requests.Session()

        # Retry transient failures (ratelimits, 5xx) with a long backoff.
        retry_strategy = Retry(
            total=5,
            backoff_factor=10,
            allowed_methods=["HEAD", "GET"],
            status_forcelist=[429, 500, 502, 503, 504]
        )

        # No timeouts - set them explicitly on API calls below!
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.requests.mount("https://", adapter)
        self.requests.mount("http://", adapter)

    def add_api_key(self, kwargs):
        # Adds the API key to request params, if one was not
        # already provided outside of the client.
        if 'data' in kwargs:
            params = kwargs['data']
        else:
            params = {}
            kwargs['data'] = params

        if 'api_key' not in params:
            params['api_key'] = self.api_key

    def get(self, endpoint: str, *args, **kwargs):
        # GET `endpoint` relative to the configured base URL,
        # injecting the API key into the request payload.
        self.add_api_key(kwargs)
        return self.requests.get(self.base_url + endpoint, *args, **kwargs)
|
||||
|
||||
|
||||
def download_file(client: ItchApiClient, upload_id: int, download_path: str, creds: dict, print_url: bool=False):
    """Stream an upload's contents into `download_path`.

    `creds` may carry a download_key_id for restricted uploads; `print_url`
    echoes the resolved URL (used for externally-hosted files elsewhere in
    this script). Raises ItchDownloadError on HTTP errors.
    """
    # No timeouts, chunked uploads, default retry strategy, should be all good?
    try:
        with client.get(f"/uploads/{upload_id}/download", data=creds, stream=True) as r:
            r.raise_for_status()
            if print_url:
                print(f"Download URL: {r.url}")

            with open(download_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1048576): # 1MB chunks
                    f.write(chunk)
    except requests.exceptions.HTTPError as e:
        raise ItchDownloadError(f"Unrecoverable download error: {e}")
|
||||
|
||||
|
||||
def get_download_keys(client: ItchApiClient):
    """Fetch every download key the account owns, as a game_id -> key_id map."""
    print("Fetching all download keys...")
    keys_by_game = {}
    page = 1

    while True:
        print(f"Downloading page {page}...")
        try:
            resp = client.get("/profile/owned-keys", data={"page": page}, timeout=15)
            resp.raise_for_status()
        except Exception as e:
            print(f"Got error while fetching download keys: {e}")
            print(f"Let's just pretend this is enough and move on...")
            break

        payload = resp.json()
        if 'owned_keys' not in payload:
            break  # Assuming we're out of keys already...

        owned = payload['owned_keys']
        keys_by_game.update({key['game_id']: key['id'] for key in owned})

        # A short page means we've reached the end of the list.
        if len(owned) != payload['per_page']:
            break
        page += 1

    print(f"Fetched {len(keys_by_game)} download keys.")
    return keys_by_game
|
||||
|
||||
|
||||
def parse_jobs(jam_json: dict) -> list[tuple[int, str, str]]:
    """Turn an itch.io jam entries JSON blob into (game_id, title, url) jobs."""
    if 'jam_games' not in jam_json:
        raise Exception("Provided JSON is not a valid itch.io jam JSON.")

    # Each entry carries a nested "game" object with the bits we need.
    jobs = []
    for entry in jam_json['jam_games']:
        game = entry['game']
        jobs.append((int(game['id']), game['title'], game['url']))
    return jobs
|
||||
|
||||
|
||||
def get_game_jam_json(jam_path: str) -> dict:
    """Load a game jam's entries JSON from an itch.io jam URL or a local file.

    For URLs, scrapes the jam page for the "ViewJam" JS init line to learn
    the numeric jam ID, then fetches /jam/<id>/entries.json.
    Raises Exception with a human-readable message on download/parse errors.
    """
    # Do we have an URL?
    jam_path = jam_path.strip()
    if jam_path.startswith("https://") or jam_path.startswith("http://"):
        r = requests.get(jam_path)
        if not r.ok:
            raise Exception(f"Could not download game jam site from {jam_path} (code {r.status_code}): {r.reason}")

        # The jam ID only appears in the JS widget init near the page end.
        jam_id_line = None
        for line in r.text.splitlines():
            if "ViewJam" in line:
                jam_id_line = line

        if jam_id_line is None:
            raise Exception("Jam site did not contain the ID line - please provide the path to the game jam entries JSON file instead.")

        found_ids = re.findall(r'\"id\":([0-9]+)', jam_id_line)
        if len(found_ids) == 0:
            raise Exception("Could not extract the jam ID from the provided site.")

        jam_id = int(found_ids[0])  # Always grab the first one for now...
        print(f"Extracted jam ID: {jam_id}")

        r = requests.get(f"https://itch.io/jam/{jam_id}/entries.json")
        if not r.ok:
            raise Exception("Could not download the game jam entries list.")

        content = r.text
    elif os.path.isfile(jam_path):
        try:
            with open(jam_path) as f:
                content = f.read()
        except Exception as e:
            raise Exception(f"Could not open/read the game jam entries file: {e}")
    else:
        raise Exception("Provided game jam path is invalid (not a link/existing file).")

    try:
        jam_json = json.loads(content)
    except json.decoder.JSONDecodeError:
        # BUG FIX: previously this only printed a warning and then returned
        # the unbound name `jam_json` (NameError). Fail loudly instead.
        raise Exception("Provided game jam entries file is not a valid JSON file.")

    return jam_json
|
||||
|
||||
|
||||
def download_jam(jam_path: str, download_to: str, api_key: str, continue_from: str=None):
|
||||
client = ItchApiClient(ITCH_API, api_key)
|
||||
jam_json = get_game_jam_json(jam_path)
|
||||
|
||||
# Check API key validity:
|
||||
profile_req = client.get("/profile")
|
||||
if not profile_req.ok:
|
||||
print(f"Provided API key appears to be invalid: {profile_req.text}")
|
||||
exit(1)
|
||||
|
||||
jobs = parse_jobs(jam_json)
|
||||
jobs_successful = []
|
||||
jobs_failed = []
|
||||
|
||||
download_keys = get_download_keys(client)
|
||||
game_id_to_meta = {} # dict[game_id: int, (title: str, url: str)]
|
||||
|
||||
for game_id, title, url in jobs:
|
||||
game_id_to_meta[game_id] = (title, url)
|
||||
|
||||
failed_game_ids = set()
|
||||
|
||||
# No "continue from"? Yep, start right away.
|
||||
should_process_jobs = continue_from is None
|
||||
|
||||
for game_id, title, url in jobs:
|
||||
label = f"{title} ({game_id})"
|
||||
if not should_process_jobs:
|
||||
if game_id == continue_from:
|
||||
should_process_jobs = True
|
||||
else:
|
||||
continue
|
||||
|
||||
try:
|
||||
download_path = os.path.join(download_to, slugify(title))
|
||||
if PEDANTIC_MIRRORING:
|
||||
site_mirror_path = os.path.join(download_to, "_sites")
|
||||
else:
|
||||
site_mirror_path = os.path.join(download_path, "site")
|
||||
os.makedirs(download_path, exist_ok=True)
|
||||
os.makedirs(site_mirror_path, exist_ok=True)
|
||||
except:
|
||||
raise ItchDownloadError(f"Could not create download directory: {download_path}")
|
||||
|
||||
print(f"Trying to download {label} to {download_path}")
|
||||
|
||||
if WGET_PATH is not None:
|
||||
print("Downloading site...")
|
||||
if PEDANTIC_MIRRORING:
|
||||
extra_wget_args = [
|
||||
"--timestamping",
|
||||
"--span-hosts",
|
||||
"--convert-links",
|
||||
"--adjust-extension",
|
||||
"--page-requisites",
|
||||
]
|
||||
else:
|
||||
extra_wget_args = []
|
||||
|
||||
wget = subprocess.run([
|
||||
WGET_PATH,
|
||||
*extra_wget_args,
|
||||
"--quiet",
|
||||
url
|
||||
], cwd=site_mirror_path)
|
||||
|
||||
if wget.returncode != 0:
|
||||
print(f"Warning: Site mirroring failed/incomplete.")
|
||||
|
||||
creds = {}
|
||||
if game_id in download_keys:
|
||||
creds['download_key_id'] = download_keys[game_id]
|
||||
print("Using {creds} for private uploads")
|
||||
|
||||
game_uploads_req = client.get(f"/games/{game_id}/uploads", data=creds, timeout=15)
|
||||
if not game_uploads_req.ok:
|
||||
raise ItchDownloadError(f"Could not fetch game uploads for {label}: {game_uploads_req.text}")
|
||||
|
||||
game_uploads = game_uploads_req.json()['uploads']
|
||||
print(f"Found {len(game_uploads)} upload(s)")
|
||||
|
||||
try:
|
||||
for upload in game_uploads:
|
||||
upload_id = upload['id']
|
||||
file_name = upload['filename']
|
||||
file_size = upload['size']
|
||||
upload_is_external = upload['storage'] == 'external'
|
||||
|
||||
print(f"Downloading '{file_name}' ({upload_id}), {file_size} bytes...")
|
||||
if upload_is_external:
|
||||
print("***********************************************************")
|
||||
print("* *")
|
||||
print("* WARNING: External storage - downloads will likely fail. *")
|
||||
print("* Check the URL displayed below manually! *")
|
||||
print("* *")
|
||||
print("***********************************************************")
|
||||
|
||||
target_path = os.path.join(download_path, file_name)
|
||||
try:
|
||||
download_file(client, upload_id, target_path, creds, print_url=upload_is_external)
|
||||
except ItchDownloadError as e:
|
||||
jobs_failed.append((game_id, file_name, str(e)))
|
||||
print(f"Download failed for {file_name}: {e}")
|
||||
continue
|
||||
|
||||
try:
|
||||
actual_file_size = os.stat(target_path).st_size
|
||||
if actual_file_size == file_size:
|
||||
jobs_successful.append((game_id, file_name))
|
||||
else:
|
||||
jobs_failed.append((game_id, file_name, f"File size is {actual_file_size}, expected {file_size}"))
|
||||
except FileNotFoundError:
|
||||
jobs_failed.append((game_id, file_name, "Could not download file"))
|
||||
|
||||
print(f"Done downloading {label}")
|
||||
except ItchDownloadError as e:
|
||||
failed_game_ids.append((game_id, str(e)))
|
||||
print(message)
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f"Critical error while downloading {label}: {e}")
|
||||
failed_game_ids.append((game_id, str(e)))
|
||||
traceback.print_exc()
|
||||
print(message)
|
||||
continue
|
||||
|
||||
successful_titles = {}
|
||||
for game_id, file_name in jobs_successful:
|
||||
if game_id not in successful_titles:
|
||||
successful_titles[game_id] = [file_name]
|
||||
|
||||
if any(successful_titles):
|
||||
print(f"\nAll done, downloaded files for {len(successful_titles)} title(s):")
|
||||
for game_id, files in successful_titles.items():
|
||||
print(f"{game_id_to_meta[game_id][0]}, {len(files)} file(s)")
|
||||
|
||||
if any(jobs_failed):
|
||||
print(f"\nDownloads failed for {len(jobs_failed)} file(s):")
|
||||
for game_id, file_name, message in jobs_failed:
|
||||
title, url = game_id_to_meta[game_id]
|
||||
print(f"{title} - {file_name} - {message}")
|
||||
print(f"Title URL: {url}")
|
||||
|
||||
if any(failed_game_ids):
|
||||
print(f"\nCompletely failed downloads for {len(failed_game_ids)} titles:")
|
||||
for game_id, message in failed_game_ids:
|
||||
title, url = game_id_to_meta[game_id]
|
||||
print(f"{title} ({game_id}) - {url} - {message}")
|
||||
|
||||
|
||||
def get_parser():
    """Build the argument parser for the jam downloader CLI."""
    parser = argparse.ArgumentParser(description="Downloads games from public Itch.io game jams.")
    parser.add_argument("entries", help="path to the game jam entries.json file")
    parser.add_argument("--api-key", metavar="key", required=True, help="itch.io API key from https://itch.io/user/settings/api-keys")
    parser.add_argument("--download-to", metavar="path", help="directory to save results into (default: current dir)")
    parser.add_argument("--continue-from", metavar="id", type=int, help="skip all entries until the provided entry ID is found")
    return parser
|
||||
|
||||
|
||||
def get_download_dir(args: argparse.Namespace) -> str:
    """Resolve the target download directory, creating it if necessary.

    Defaults to the current working directory when --download-to is absent.
    """
    download_to = os.getcwd()
    if args.download_to is not None:
        download_to = os.path.normpath(args.download_to)
        # exist_ok: re-running into the same directory must not crash
        # (matches the behavior of the new cli.py).
        os.makedirs(download_to, exist_ok=True)

    return download_to
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = get_parser().parse_args()
|
||||
download_to = get_download_dir(args)
|
||||
download_jam(args.entries, download_to, args.api_key, continue_from=args.continue_from)
|
1
itch_dl/__init__.py
Normal file
1
itch_dl/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
# itch-dl package version.
__version__ = '0.1.0'
|
3
itch_dl/__main__.py
Normal file
3
itch_dl/__main__.py
Normal file
@ -0,0 +1,3 @@
|
||||
#!/usr/bin/env python3
"""Module entry point: lets `python -m itch_dl` invoke the CLI."""
import sys

from itch_dl.cli import run

if __name__ == "__main__":
    # Only run when executed, not when imported; propagate run()'s
    # declared int return value as the process exit code.
    sys.exit(run())
|
43
itch_dl/api.py
Normal file
43
itch_dl/api.py
Normal file
@ -0,0 +1,43 @@
|
||||
from typing import Optional
|
||||
|
||||
from requests import Session
|
||||
from urllib3.util.retry import Retry
|
||||
from requests.adapters import HTTPAdapter
|
||||
|
||||
from .consts import ITCH_API
|
||||
|
||||
|
||||
class ItchApiClient:
    """Thin requests.Session wrapper for talking to the itch.io API."""

    def __init__(self, api_key: str, base_url: Optional[str] = None):
        self.base_url = base_url or ITCH_API
        self.api_key = api_key

        self.requests = Session()

        # Retry transient server errors and ratelimits with a generous backoff.
        adapter = HTTPAdapter(max_retries=Retry(
            total=5,
            backoff_factor=10,
            allowed_methods=["HEAD", "GET"],
            status_forcelist=[429, 500, 502, 503, 504]
        ))

        # No timeouts here - set them explicitly on API calls below!
        for scheme in ("https://", "http://"):
            self.requests.mount(scheme, adapter)

    def get(self, endpoint: str, append_api_key: bool = True, **kwargs):
        """GET an API endpoint (or a full https:// URL), optionally injecting the API key."""
        if append_api_key:
            # Respect an api_key the caller already placed in the payload.
            payload = kwargs.get('data') or {}
            payload.setdefault('api_key', self.api_key)
            kwargs['data'] = payload

        target = endpoint if endpoint.startswith("https://") else self.base_url + endpoint
        return self.requests.get(target, **kwargs)
|
67
itch_dl/cli.py
Normal file
67
itch_dl/cli.py
Normal file
@ -0,0 +1,67 @@
|
||||
import os
|
||||
import logging
|
||||
import argparse
|
||||
|
||||
from .handlers import get_jobs_for_url_or_path
|
||||
from .downloader import drive_downloads
|
||||
from .keys import get_download_keys
|
||||
from .api import ItchApiClient
|
||||
logging.basicConfig()
|
||||
logging.getLogger().setLevel(logging.INFO)
|
||||
|
||||
|
||||
def parse_args():
    """Build and evaluate the itch-dl command-line argument parser."""
    parser = argparse.ArgumentParser(description="Bulk download stuff from Itch.io.")
    parser.add_argument("url_or_path",
                        help="itch.io URL or path to a game jam entries.json file")
    parser.add_argument("--api-key", metavar="key", required=True,
                        help="itch.io API key - https://itch.io/user/settings/api-keys")
    parser.add_argument("--urls-only", action="store_true",
                        help="print scraped game URLs without downloading them")
    parser.add_argument("--download-to", metavar="path",
                        help="directory to save results into (default: current dir)")
    parser.add_argument("--parallel", metavar="parallel", type=int, default=1,
                        help="how many threads to use for downloading games (default: 1)")
    parser.add_argument("--mirror-web", action="store_true",
                        help="try to fetch assets on game sites")
    parser.add_argument("--verbose", action="store_true",
                        help="print verbose logs")
    return parser.parse_args()
|
||||
|
||||
|
||||
def run() -> int:
    """CLI entry point: scrape job URLs, validate the API key, drive downloads.

    Returns a process exit code (1 when no URLs were found, 0 after a
    --urls-only listing); note it also calls exit(1) directly when the
    API key is rejected by /profile.
    """
    args = parse_args()
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    jobs = get_jobs_for_url_or_path(args.url_or_path, args.api_key)
    jobs = list(set(jobs)) # Deduplicate, just in case...
    logging.info(f"Found {len(jobs)} URL(s).")

    if len(jobs) == 0:
        print("No URLs to download.")
        return 1

    if args.urls_only:
        for job in jobs:
            print(job)

        return 0

    download_to = os.getcwd()
    if args.download_to is not None:
        download_to = os.path.normpath(args.download_to)
        os.makedirs(download_to, exist_ok=True)

    client = ItchApiClient(args.api_key)

    # Check API key validity:
    profile_req = client.get("/profile")
    if not profile_req.ok:
        print(f"Provided API key appears to be invalid: {profile_req.text}")
        exit(1)

    # Grab all the download keys (there's no way to fetch them per title...):
    keys = get_download_keys(client)

    return drive_downloads(jobs, download_to, args.api_key, keys, parallel=args.parallel)
|
29
itch_dl/consts.py
Normal file
29
itch_dl/consts.py
Normal file
@ -0,0 +1,29 @@
|
||||
from enum import Enum
|
||||
|
||||
ITCH_BASE = "itch.io"
|
||||
ITCH_URL = f"https://{ITCH_BASE}"
|
||||
ITCH_API = f"https://api.{ITCH_BASE}"
|
||||
|
||||
ITCH_BROWSER_TYPES = [
|
||||
"games",
|
||||
"tools",
|
||||
"game-assets",
|
||||
"comics",
|
||||
"books",
|
||||
"physical-games",
|
||||
"soundtracks",
|
||||
"game-mods",
|
||||
"misc",
|
||||
]
|
||||
|
||||
|
||||
class ItchDownloadResult(Enum):
    """Coarse outcome of a single download attempt."""
    SUCCESS = 0
    FAILURE = 1
    MISSING_DOWNLOAD = 2
    DOWNLOAD_TIMEOUT = 3
|
||||
|
||||
|
||||
# I mean, not really a const but eh
class ItchDownloadError(Exception):
    """Raised for unrecoverable itch-dl scraping/downloading errors."""
    pass
|
251
itch_dl/downloader.py
Normal file
251
itch_dl/downloader.py
Normal file
@ -0,0 +1,251 @@
|
||||
import os
|
||||
import shutil
|
||||
import logging
|
||||
import traceback
|
||||
import subprocess
|
||||
from typing import Tuple, List, Dict, TypedDict, Optional
|
||||
|
||||
from slugify import slugify
|
||||
from requests.exceptions import HTTPError
|
||||
|
||||
from tqdm import tqdm
|
||||
from tqdm.contrib.concurrent import thread_map
|
||||
|
||||
from .api import ItchApiClient
|
||||
from .consts import ItchDownloadError, ItchDownloadResult
|
||||
|
||||
|
||||
# ------------------------------
|
||||
# --- OLD STUFF --- CUT HERE ---
|
||||
# ------------------------------
|
||||
|
||||
|
||||
WGET_PATH = shutil.which("wget")
|
||||
if WGET_PATH is None:
|
||||
print(f"Warning: wget not available, site mirroring will not work!")
|
||||
|
||||
|
||||
def download_file(client: ItchApiClient, upload_id: int, download_path: str, creds: dict, print_url: bool=False):
    """Stream an upload's contents into `download_path`.

    `creds` may carry a download_key_id for restricted uploads; `print_url`
    echoes the resolved URL (used elsewhere for externally-hosted files).
    Raises ItchDownloadError, chained to the original HTTPError.
    """
    # No timeouts, chunked uploads, default retry strategy, should be all good?
    try:
        with client.get(f"/uploads/{upload_id}/download", data=creds, stream=True) as r:
            r.raise_for_status()
            if print_url:
                print(f"Download URL: {r.url}")

            with open(download_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1048576):  # 1MB chunks
                    f.write(chunk)
    except HTTPError as e:
        # Chain the cause so the original status/URL survive in tracebacks.
        raise ItchDownloadError(f"Unrecoverable download error: {e}") from e
|
||||
|
||||
|
||||
def get_meta_for_game_url(game_url: str) -> int:
    """Find the Game ID for a game URL via its public data.json endpoint.

    Raises ItchDownloadError when the response carries no "id" key.
    NOTE(review): the original annotated Tuple[int, str] but only ever
    returned the ID (and crashed on an unbound `r` before that) - the
    annotation now matches the actual return value.
    """
    # Deliberately local: this module only imports requests.exceptions at top level.
    import requests

    data_url = game_url.rstrip("/") + "/data.json"
    data_req = requests.get(data_url)
    data_req.raise_for_status()  # was `r.raise_for_status()` - NameError on every call

    data_json = data_req.json()
    if 'id' not in data_json:
        raise ItchDownloadError(f"Cannot fetch the Game ID for URL: {game_url}")

    return data_json['id']
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def download_jam(jam_path: str, download_to: str, api_key: str, continue_from: str=None):
|
||||
client = ItchApiClient(api_key)
|
||||
jam_json = get_game_jam_json(jam_path)
|
||||
|
||||
# Check API key validity:
|
||||
profile_req = client.get("/profile")
|
||||
if not profile_req.ok:
|
||||
print(f"Provided API key appears to be invalid: {profile_req.text}")
|
||||
exit(1)
|
||||
|
||||
jobs = parse_jobs(jam_json)
|
||||
jobs_successful = []
|
||||
jobs_failed = []
|
||||
|
||||
game_id_to_meta = {} # dict[game_id: int, (title: str, url: str)]
|
||||
|
||||
for game_id, title, url in jobs:
|
||||
game_id_to_meta[game_id] = (title, url)
|
||||
|
||||
failed_game_ids = set()
|
||||
|
||||
# No "continue from"? Yep, start right away.
|
||||
should_process_jobs = continue_from is None
|
||||
|
||||
for game_id, title, url in jobs:
|
||||
label = f"{title} ({game_id})"
|
||||
if not should_process_jobs:
|
||||
if game_id == continue_from:
|
||||
should_process_jobs = True
|
||||
else:
|
||||
continue
|
||||
|
||||
try:
|
||||
download_path = os.path.join(download_to, slugify(title))
|
||||
if PEDANTIC_MIRRORING:
|
||||
site_mirror_path = os.path.join(download_to, "_sites")
|
||||
else:
|
||||
site_mirror_path = os.path.join(download_path, "site")
|
||||
os.makedirs(download_path, exist_ok=True)
|
||||
os.makedirs(site_mirror_path, exist_ok=True)
|
||||
except:
|
||||
raise ItchDownloadError(f"Could not create download directory: {download_path}")
|
||||
|
||||
print(f"Trying to download {label} to {download_path}")
|
||||
|
||||
if WGET_PATH is not None:
|
||||
print("Downloading site...")
|
||||
if PEDANTIC_MIRRORING:
|
||||
extra_wget_args = [
|
||||
"--timestamping",
|
||||
"--span-hosts",
|
||||
"--convert-links",
|
||||
"--adjust-extension",
|
||||
"--page-requisites",
|
||||
]
|
||||
else:
|
||||
extra_wget_args = []
|
||||
|
||||
wget = subprocess.run([
|
||||
WGET_PATH,
|
||||
*extra_wget_args,
|
||||
"--quiet",
|
||||
url
|
||||
], cwd=site_mirror_path)
|
||||
|
||||
if wget.returncode != 0:
|
||||
print(f"Warning: Site mirroring failed/incomplete.")
|
||||
|
||||
creds = {}
|
||||
if game_id in self.download_keys:
|
||||
creds['download_key_id'] = self.download_keys[game_id]
|
||||
print("Using {creds} for private uploads")
|
||||
|
||||
game_uploads_req = client.get(f"/games/{game_id}/uploads", data=creds, timeout=15)
|
||||
if not game_uploads_req.ok:
|
||||
raise ItchDownloadError(f"Could not fetch game uploads for {label}: {game_uploads_req.text}")
|
||||
|
||||
game_uploads = game_uploads_req.json()['uploads']
|
||||
print(f"Found {len(game_uploads)} upload(s)")
|
||||
|
||||
try:
|
||||
for upload in game_uploads:
|
||||
upload_id = upload['id']
|
||||
file_name = upload['filename']
|
||||
file_size = upload['size']
|
||||
upload_is_external = upload['storage'] == 'external'
|
||||
|
||||
print(f"Downloading '{file_name}' ({upload_id}), {file_size} bytes...")
|
||||
if upload_is_external:
|
||||
print("***********************************************************")
|
||||
print("* *")
|
||||
print("* WARNING: External storage - downloads will likely fail. *")
|
||||
print("* Check the URL displayed below manually! *")
|
||||
print("* *")
|
||||
print("***********************************************************")
|
||||
|
||||
target_path = os.path.join(download_path, file_name)
|
||||
try:
|
||||
download_file(client, upload_id, target_path, creds, print_url=upload_is_external)
|
||||
except ItchDownloadError as e:
|
||||
jobs_failed.append((game_id, file_name, str(e)))
|
||||
print(f"Download failed for {file_name}: {e}")
|
||||
continue
|
||||
|
||||
try:
|
||||
actual_file_size = os.stat(target_path).st_size
|
||||
if actual_file_size == file_size:
|
||||
jobs_successful.append((game_id, file_name))
|
||||
else:
|
||||
jobs_failed.append((game_id, file_name, f"File size is {actual_file_size}, expected {file_size}"))
|
||||
except FileNotFoundError:
|
||||
jobs_failed.append((game_id, file_name, "Could not download file"))
|
||||
|
||||
print(f"Done downloading {label}")
|
||||
except ItchDownloadError as e:
|
||||
failed_game_ids.append((game_id, str(e)))
|
||||
print(message)
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f"Critical error while downloading {label}: {e}")
|
||||
failed_game_ids.append((game_id, str(e)))
|
||||
traceback.print_exc()
|
||||
print(message)
|
||||
continue
|
||||
|
||||
successful_titles = {}
|
||||
for game_id, file_name in jobs_successful:
|
||||
if game_id not in successful_titles:
|
||||
successful_titles[game_id] = [file_name]
|
||||
|
||||
if any(successful_titles):
|
||||
print(f"\nAll done, downloaded files for {len(successful_titles)} title(s):")
|
||||
for game_id, files in successful_titles.items():
|
||||
print(f"{game_id_to_meta[game_id][0]}, {len(files)} file(s)")
|
||||
|
||||
if any(jobs_failed):
|
||||
print(f"\nDownloads failed for {len(jobs_failed)} file(s):")
|
||||
for game_id, file_name, message in jobs_failed:
|
||||
title, url = game_id_to_meta[game_id]
|
||||
print(f"{title} - {file_name} - {message}")
|
||||
print(f"Title URL: {url}")
|
||||
|
||||
if any(failed_game_ids):
|
||||
print(f"\nCompletely failed downloads for {len(failed_game_ids)} titles:")
|
||||
for game_id, message in failed_game_ids:
|
||||
title, url = game_id_to_meta[game_id]
|
||||
print(f"{title} ({game_id}) - {url} - {message}")
|
||||
|
||||
|
||||
# ------------------------------
|
||||
# --- OLD STUFF --- CUT HERE ---
|
||||
# ------------------------------
|
||||
|
||||
|
||||
class GameAuthor(TypedDict, total=False):
    """Author info for a downloaded game; all fields optional (total=False)."""
    name: str
    url: str
|
||||
|
||||
|
||||
class GameMetadata(TypedDict, total=False):
    """Extra scraped page metadata; all fields optional (total=False)."""
    description: str
|
||||
|
||||
|
||||
class GameDownloadJob(TypedDict, total=False):
    """Work item for one game download; fields fill in as scraping progresses."""
    url: str
    game_id: int
    title: str
    author: GameAuthor
    metadata: GameMetadata
|
||||
|
||||
|
||||
class GameDownloader:
    """Holds shared state (API client, download keys) for per-game downloads."""

    def __init__(self, download_to: str, api_key: str, keys: Dict[int, str]):
        # Root directory that downloads land in.
        self.download_to = download_to
        # Presumably game_id -> download key for restricted uploads - confirm
        # against itch_dl.keys.get_download_keys once the rewrite settles.
        self.download_keys = keys

        self.client = ItchApiClient(api_key)

    def download(self, url: str):
        # WIP: the rewrite has not reached the actual download logic yet.
        job = GameDownloadJob(url=url)
        raise NotImplementedError("Not yet!")
|
||||
|
||||
|
||||
def drive_downloads(jobs: List[str], download_to: str, api_key: str, keys: Dict[int, str], parallel: int = 1):
    """Download every URL in `jobs` with a progress bar, optionally on multiple threads."""
    worker = GameDownloader(download_to, api_key, keys)

    if parallel <= 1:
        # Sequential path: plain tqdm progress bar.
        for url in tqdm(jobs):
            worker.download(url)
    else:
        thread_map(worker.download, jobs, max_workers=parallel)
|
218
itch_dl/handlers.py
Normal file
218
itch_dl/handlers.py
Normal file
@ -0,0 +1,218 @@
|
||||
import re
|
||||
import json
|
||||
import os.path
|
||||
import logging
|
||||
import urllib.parse
|
||||
from typing import List, Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .api import ItchApiClient
|
||||
from .consts import ITCH_BASE, ITCH_URL, ITCH_BROWSER_TYPES, ItchDownloadError
|
||||
|
||||
|
||||
def get_jobs_for_game_jam_json(game_jam_json: dict) -> List[str]:
    """Extract the game URLs to download from an itch.io jam entries JSON blob.

    Raises ItchDownloadError (consistent with the rest of this module; it is
    an Exception subclass, so existing broad handlers still catch it) when
    the payload does not look like a jam entries document.
    """
    if 'jam_games' not in game_jam_json:
        raise ItchDownloadError("Provided JSON is not a valid itch.io jam JSON.")

    return [g['game']['url'] for g in game_jam_json['jam_games']]
|
||||
|
||||
|
||||
def get_int_after_marker_in_json(text: str, marker: str, key: str) -> Optional[int]:
    """Extract an integer config value embedded in an itch.io page.

    itch.io pages are mostly prerendered HTML, but interactive widgets are
    initialized near the bottom with JavaScript like
    `I.WidgetHandler({"id": 123, ...})`. Given the marker (e.g. "I.ViewJam")
    and a JSON key, return that key's integer value - or None if the marker
    is missing or the key does not match exactly once.
    """
    # Config blocks live near the end of the page, so scan lines back to front.
    tail: Optional[str] = None
    for line in reversed(text.splitlines()):
        position = line.find(marker)
        if position >= 0:
            tail = line[position:]
            break

    if tail is None:
        return None

    # Match `"key":` followed by optional whitespace and digits.
    matches = re.findall(rf'"{key}":\s?(\d+)', tail)
    if len(matches) != 1:
        return None

    return int(matches[0])
|
||||
|
||||
|
||||
def get_game_jam_json(jam_url: str, client: ItchApiClient) -> dict:
    """Fetch a game jam's entries JSON, given the jam's public URL.

    The jam page embeds its numeric ID in an I.ViewJam config block; that ID
    is needed to build the entries.json endpoint URL.

    :raises ItchDownloadError: if the jam site, its ID, or the entries list
        cannot be retrieved.
    """
    site = client.get(jam_url)
    if not site.ok:
        raise ItchDownloadError(f"Could not download the game jam site: {site.status_code} {site.reason}")

    jam_id: Optional[int] = get_int_after_marker_in_json(site.text, "I.ViewJam", "id")
    if jam_id is None:
        raise ItchDownloadError("Provided site did not contain the Game Jam ID. Provide "
                                "the path to the game jam entries JSON file instead, or "
                                "create an itch-dl issue with the Game Jam URL.")

    logging.info(f"Extracted Game Jam ID: {jam_id}")

    entries = client.get(f"{ITCH_URL}/jam/{jam_id}/entries.json")
    if not entries.ok:
        raise ItchDownloadError(f"Could not download the game jam entries list: {entries.status_code} {entries.reason}")

    return entries.json()
|
||||
|
||||
|
||||
def get_jobs_for_browse_url(url: str, client: ItchApiClient) -> List[str]:
    """
    Every browser page has a hidden RSS feed that can be accessed by
    appending .xml to its URL. An optional "page" argument lets us
    iterate over their contents. When no more elements are available,
    the last returned <channel> has no <item> children.

    The input URL is cleaned in the main URL handler, so append the
    .xml?page=N suffix and iterate until we've caught 'em all.

    :param url: Cleaned itch.io browse URL (no trailing slash or query).
    :param client: API client used for the feed requests.
    :return: De-duplicated list of game URLs found in the feeds.
    :raises ItchDownloadError: if no game URLs were found at all.
    """
    page = 1
    found_urls = set()
    # Fix: this used an f-string prefix on a %s-style lazy logging call -
    # the %s placeholder is filled by logging, so no f-prefix belongs here.
    logging.info("Scraping game URLs from RSS feeds for %s", url)

    while True:
        logging.info(f"Downloading page {page} (found {len(found_urls)} URLs total)")
        r = client.get(f"{url}.xml?page={page}", append_api_key=False)
        if not r.ok:
            logging.info("RSS feed returned %s, finished.", r.reason)
            break

        soup = BeautifulSoup(r.text, features="xml")
        rss_items = soup.find_all("item")
        if len(rss_items) < 1:
            logging.info("No more items, finished.")
            break

        logging.info(f"Found {len(rss_items)} items.")
        for item in rss_items:
            link_node = item.find("link")
            if link_node is None:
                continue

            node_url = link_node.text.strip()
            if len(node_url) > 0:
                found_urls.add(node_url)

        page += 1

    if len(found_urls) == 0:
        raise ItchDownloadError("No game URLs found to download.")

    return list(found_urls)
|
||||
|
||||
|
||||
def get_jobs_for_itch_url(url: str, client: ItchApiClient) -> List[str]:
    """Convert an itch.io URL into a list of game URLs to download.

    Handles game jams, browse/category pages, user profiles and single game
    pages; raises for URL types that cannot (or should not) be downloaded.

    :raises ValueError: for malformed or unsupported-by-design URLs.
    :raises NotImplementedError: for URL types not supported yet.
    """
    if url.startswith("http://"):
        logging.info("HTTP link provided, upgrading to HTTPS")
        url = "https://" + url[7:]

    # Fix: derive the slice length from the actual prefix instead of the
    # magic constant 20 (which silently broke if ITCH_BASE ever changed).
    www_prefix = f"https://www.{ITCH_BASE}/"
    if url.startswith(www_prefix):
        logging.info(f"Correcting www.{ITCH_BASE} to {ITCH_BASE}")
        url = ITCH_URL + '/' + url[len(www_prefix):]

    url_parts = urllib.parse.urlparse(url)
    url_path_parts: List[str] = [x for x in str(url_parts.path).split('/') if len(x) > 0]

    if url_parts.netloc == ITCH_BASE:
        if len(url_path_parts) == 0:
            raise NotImplementedError("itch-dl cannot download the entirety of itch.io.")
            # (yet) (also leafo would not be happy with the bandwidth bill)

        site = url_path_parts[0]

        if site == "jam":  # Game jams
            if len(url_path_parts) < 2:
                raise ValueError(f"Incomplete game jam URL: {url}")

            logging.info("Fetching Game Jam JSON...")
            clean_game_jam_url = f"{ITCH_URL}/jam/{url_path_parts[1]}"
            game_jam_json = get_game_jam_json(clean_game_jam_url, client)
            return get_jobs_for_game_jam_json(game_jam_json)

        elif site in ITCH_BROWSER_TYPES:  # Browser
            clean_browse_url = '/'.join([ITCH_URL, *url_path_parts])
            return get_jobs_for_browse_url(clean_browse_url, client)

        elif site in ("b", "bundle"):  # Bundles
            raise NotImplementedError("itch-dl cannot download bundles yet.")

        elif site in ("j", "jobs"):  # Jobs...
            raise ValueError("itch-dl cannot download a job.")

        elif site in ("t", "board", "community"):  # Forums
            raise ValueError("itch-dl cannot download forums.")

        elif site == "profile":  # Forum Profile
            if len(url_path_parts) >= 2:
                username = url_path_parts[1]
                logging.info("Correcting user profile to creator page for %s", username)
                return get_jobs_for_itch_url(f"https://{username}.{ITCH_BASE}", client)

            raise ValueError("itch-dl expects a username in profile links.")

        # Something else?
        raise NotImplementedError(f"itch-dl does not understand \"{site}\" URLs. Please file a new issue.")

    elif url_parts.netloc.endswith(f".{ITCH_BASE}"):
        if len(url_path_parts) == 0:  # Author
            # TODO: Find I.UserPage, regex for "user_id": [0-9]+, find the responsible API?
            raise NotImplementedError("itch-dl cannot download author pages yet.")

        else:  # Single game
            # Just clean and return the URL:
            return [f"https://{url_parts.netloc}/{url_path_parts[0]}"]

    else:
        raise ValueError(f"Unknown domain: {url_parts.netloc}")
|
||||
|
||||
|
||||
def get_jobs_for_path(path: str) -> List[str]:
    """Read download jobs from a local file.

    Two formats are accepted: an itch.io Game Jam entries JSON, or a plain
    text file with one URL per line (non-URL lines are ignored).

    :param path: Path to the file to read.
    :return: List of game URLs to download.
    :raises ValueError: if the file matches neither format.
    """
    try:  # Game Jam Entries JSON?
        with open(path) as f:
            json_data = json.load(f)

        if not isinstance(json_data, dict):
            raise ValueError(f"File does not contain a JSON dict: {path}")

        if 'jam_games' in json_data:
            logging.info("Parsing provided file as a Game Jam Entries JSON...")
            return get_jobs_for_game_jam_json(json_data)
    except json.JSONDecodeError:
        pass  # Not a valid JSON, okay...

    url_list = []
    with open(path) as f:  # Plain job list?
        for line in f:
            line = line.strip()
            if line.startswith("https://") or line.startswith("http://"):
                url_list.append(line)

    if len(url_list) > 0:
        logging.info("Parsing provided file as a list of URLs to fetch...")
        return url_list

    # Fix: dropped the stray f-prefix - this string has no placeholders.
    raise ValueError("File format is unknown - cannot read URLs to download.")
|
||||
|
||||
|
||||
def get_jobs_for_url_or_path(path_or_url: str, api_key: str) -> List[str]:
    """Returns a list of Game URLs for a given itch.io URL or file.

    :param path_or_url: An itch.io URL, or a path to a local jobs file.
    :param api_key: itch.io API key used for any network requests.
    :raises ValueError: if the argument is neither a URL nor an existing file.
    """
    path_or_url = path_or_url.strip()

    if path_or_url.startswith("http://"):
        logging.info("HTTP link provided, upgrading to HTTPS")
        path_or_url = "https://" + path_or_url[7:]

    if path_or_url.startswith("https://"):
        client = ItchApiClient(api_key)
        return get_jobs_for_itch_url(path_or_url, client)
    elif os.path.isfile(path_or_url):
        return get_jobs_for_path(path_or_url)
    else:
        # Fix: previously this fell through and implicitly returned None,
        # violating the List[str] contract and crashing callers that iterate
        # the result. Fail loudly with a clear message instead.
        raise ValueError(f"Cannot handle path or URL: {path_or_url}")
|
31
itch_dl/keys.py
Normal file
31
itch_dl/keys.py
Normal file
@ -0,0 +1,31 @@
|
||||
import logging
|
||||
from typing import Dict
|
||||
|
||||
from .api import ItchApiClient
|
||||
|
||||
|
||||
def get_download_keys(client: ItchApiClient) -> Dict[int, str]:
|
||||
logging.info("Fetching all download keys...")
|
||||
download_keys = {}
|
||||
page = 1
|
||||
|
||||
while True:
|
||||
logging.info(f"Downloading page {page} (found {len(download_keys)} keys total)")
|
||||
r = client.get("/profile/owned-keys", data={"page": page}, timeout=15)
|
||||
if not r.ok:
|
||||
break
|
||||
|
||||
data = r.json()
|
||||
if 'owned_keys' not in data:
|
||||
break # Assuming we're out of keys already...
|
||||
|
||||
for key in data['owned_keys']:
|
||||
download_keys[key['game_id']] = key['id']
|
||||
|
||||
if len(data['owned_keys']) == data['per_page']:
|
||||
page += 1
|
||||
else:
|
||||
break
|
||||
|
||||
logging.info(f"Fetched {len(download_keys)} download keys.")
|
||||
return download_keys
|
@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "itch-dl"
|
||||
packages = [{ include = "itchdl" }]
|
||||
packages = [{ include = "itch_dl" }]
|
||||
version = "0.1.0"
|
||||
description = "itch.io bulk game downloader"
|
||||
homepage = "https://github.com/DragoonAethis/itch-dl"
|
||||
@ -24,11 +24,15 @@ classifiers = [
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.8"
|
||||
requests = "^2.26.0"
|
||||
python-slugify = "^5.0.0"
|
||||
tqdm = "^4.64.0"
|
||||
urllib3 = "^1.26.9"
|
||||
requests = "^2.27.1"
|
||||
python-slugify = "^6.1.2"
|
||||
beautifulsoup4 = "^4.11.1"
|
||||
lxml = "^4.8.0"
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
pytest = "^6.2"
|
||||
[tool.poetry.scripts]
|
||||
itch-dl = "itch_dl.cli:run"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core>=1.0.0"]
|
||||
|
Loading…
Reference in New Issue
Block a user