forked from Mirrors/itch-dl

Trial The Third: Start rewriting the thing

Wooo, someone wants to use this! Let's make it less embarrassing.
Ryszard Knop
2022-05-15 02:02:45 +02:00
parent 00cced1f41
commit 4a8f88b48e
13 changed files with 676 additions and 379 deletions

itch_dl/__init__.py Normal file (+1)

@@ -0,0 +1 @@
__version__ = '0.1.0'

itch_dl/__main__.py Normal file (+3)

@@ -0,0 +1,3 @@
#!/usr/bin/env python3
from itch_dl.cli import run
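# Allows running the tool as `python -m itch_dl <url_or_path> --api-key <key>`: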
run()

itch_dl/api.py Normal file (+43)

@@ -0,0 +1,43 @@
from typing import Optional

from requests import Session
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from .consts import ITCH_API


class ItchApiClient:
    def __init__(self, api_key: str, base_url: Optional[str] = None):
        self.base_url = base_url or ITCH_API
        self.api_key = api_key

        self.requests = Session()

        retry_strategy = Retry(
            total=5,
            backoff_factor=10,
            allowed_methods=["HEAD", "GET"],
            status_forcelist=[429, 500, 502, 503, 504]
        )

        # No timeouts - set them explicitly on API calls below!
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.requests.mount("https://", adapter)
        self.requests.mount("http://", adapter)
    def get(self, endpoint: str, append_api_key: bool = True, **kwargs):
        if append_api_key:
            params = kwargs.get('data') or {}
            if 'api_key' not in params:
                params['api_key'] = self.api_key
            kwargs['data'] = params

        if endpoint.startswith("https://"):
            url = endpoint
        else:
            url = self.base_url + endpoint

        return self.requests.get(url, **kwargs)

itch_dl/cli.py Normal file (+67)

@@ -0,0 +1,67 @@
import os
import logging
import argparse

from .handlers import get_jobs_for_url_or_path
from .downloader import drive_downloads
from .keys import get_download_keys
from .api import ItchApiClient

logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)


def parse_args():
    parser = argparse.ArgumentParser(description="Bulk download stuff from Itch.io.")
    parser.add_argument("url_or_path",
                        help="itch.io URL or path to a game jam entries.json file")
    parser.add_argument("--api-key", metavar="key", required=True,
                        help="itch.io API key - https://itch.io/user/settings/api-keys")
    parser.add_argument("--urls-only", action="store_true",
                        help="print scraped game URLs without downloading them")
    parser.add_argument("--download-to", metavar="path",
                        help="directory to save results into (default: current dir)")
    parser.add_argument("--parallel", metavar="parallel", type=int, default=1,
                        help="how many threads to use for downloading games (default: 1)")
    parser.add_argument("--mirror-web", action="store_true",
                        help="try to fetch assets on game sites")
    parser.add_argument("--verbose", action="store_true",
                        help="print verbose logs")
    return parser.parse_args()


def run() -> int:
    args = parse_args()
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    jobs = get_jobs_for_url_or_path(args.url_or_path, args.api_key)
    jobs = list(set(jobs))  # Deduplicate, just in case...
    logging.info(f"Found {len(jobs)} URL(s).")

    if len(jobs) == 0:
        print("No URLs to download.")
        return 1

    if args.urls_only:
        for job in jobs:
            print(job)
        return 0

    download_to = os.getcwd()
    if args.download_to is not None:
        download_to = os.path.normpath(args.download_to)
        os.makedirs(download_to, exist_ok=True)

    client = ItchApiClient(args.api_key)

    # Check API key validity:
    profile_req = client.get("/profile")
    if not profile_req.ok:
        print(f"Provided API key appears to be invalid: {profile_req.text}")
        return 1

    # Grab all the download keys (there's no way to fetch them per title...):
    keys = get_download_keys(client)

    return drive_downloads(jobs, download_to, args.api_key, keys, parallel=args.parallel)

itch_dl/consts.py Normal file (+29)

@@ -0,0 +1,29 @@
from enum import Enum

ITCH_BASE = "itch.io"
ITCH_URL = f"https://{ITCH_BASE}"
ITCH_API = f"https://api.{ITCH_BASE}"

ITCH_BROWSER_TYPES = [
    "games",
    "tools",
    "game-assets",
    "comics",
    "books",
    "physical-games",
    "soundtracks",
    "game-mods",
    "misc",
]


class ItchDownloadResult(Enum):
    SUCCESS = 0
    FAILURE = 1
    MISSING_DOWNLOAD = 2
    DOWNLOAD_TIMEOUT = 3


# I mean, not really a const but eh
class ItchDownloadError(Exception):
    pass

itch_dl/downloader.py Normal file (+251)

@@ -0,0 +1,251 @@
import os
import shutil
import logging
import traceback
import subprocess
from typing import List, Dict, TypedDict, Optional

import requests
from requests.exceptions import HTTPError
from slugify import slugify
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map

from .api import ItchApiClient
from .keys import get_download_keys
from .consts import ItchDownloadError, ItchDownloadResult

# ------------------------------
# --- OLD STUFF --- CUT HERE ---
# (Pre-rewrite download loop, kept for reference while porting; the
# parse_jobs helper it calls no longer exists in this tree.)
# ------------------------------

WGET_PATH = shutil.which("wget")
if WGET_PATH is None:
    print("Warning: wget not available, site mirroring will not work!")

# Legacy switch from the pre-rewrite script: mirror sites into a shared
# _sites directory instead of per-game "site" subdirectories.
PEDANTIC_MIRRORING = False
def download_file(client: ItchApiClient, upload_id: int, download_path: str, creds: dict, print_url: bool = False):
    # No timeouts, chunked uploads, default retry strategy, should be all good?
    try:
        with client.get(f"/uploads/{upload_id}/download", data=creds, stream=True) as r:
            r.raise_for_status()

            if print_url:
                print(f"Download URL: {r.url}")

            with open(download_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1048576):  # 1MB chunks
                    f.write(chunk)
    except HTTPError as e:
        raise ItchDownloadError(f"Unrecoverable download error: {e}")


def get_meta_for_game_url(game_url: str) -> int:
    """Finds the Game ID for a Game URL."""
    data_url = game_url.rstrip("/") + "/data.json"
    data_req = requests.get(data_url)
    data_req.raise_for_status()

    data_json = data_req.json()
    if 'id' not in data_json:
        raise ItchDownloadError(f"Cannot fetch the Game ID for URL: {game_url}")

    return data_json['id']
def download_jam(jam_path: str, download_to: str, api_key: str, continue_from: str = None):
    client = ItchApiClient(api_key)
    jam_json = get_game_jam_json(jam_path, client)

    # Check API key validity:
    profile_req = client.get("/profile")
    if not profile_req.ok:
        print(f"Provided API key appears to be invalid: {profile_req.text}")
        exit(1)

    download_keys = get_download_keys(client)

    jobs = parse_jobs(jam_json)
    jobs_successful = []
    jobs_failed = []

    game_id_to_meta = {}  # dict[game_id: int, (title: str, url: str)]
    for game_id, title, url in jobs:
        game_id_to_meta[game_id] = (title, url)

    failed_game_ids = []

    # No "continue from"? Yep, start right away.
    should_process_jobs = continue_from is None

    for game_id, title, url in jobs:
        label = f"{title} ({game_id})"
        if not should_process_jobs:
            if game_id == continue_from:
                should_process_jobs = True
            else:
                continue

        try:
            download_path = os.path.join(download_to, slugify(title))
            if PEDANTIC_MIRRORING:
                site_mirror_path = os.path.join(download_to, "_sites")
            else:
                site_mirror_path = os.path.join(download_path, "site")
            os.makedirs(download_path, exist_ok=True)
            os.makedirs(site_mirror_path, exist_ok=True)
        except OSError:
            raise ItchDownloadError(f"Could not create download directory: {download_path}")

        print(f"Trying to download {label} to {download_path}")

        if WGET_PATH is not None:
            print("Downloading site...")
            if PEDANTIC_MIRRORING:
                extra_wget_args = [
                    "--timestamping",
                    "--span-hosts",
                    "--convert-links",
                    "--adjust-extension",
                    "--page-requisites",
                ]
            else:
                extra_wget_args = []

            wget = subprocess.run([
                WGET_PATH,
                *extra_wget_args,
                "--quiet",
                url
            ], cwd=site_mirror_path)

            if wget.returncode != 0:
                print("Warning: Site mirroring failed/incomplete.")

        creds = {}
        if game_id in download_keys:
            creds['download_key_id'] = download_keys[game_id]
            print(f"Using {creds} for private uploads")

        game_uploads_req = client.get(f"/games/{game_id}/uploads", data=creds, timeout=15)
        if not game_uploads_req.ok:
            raise ItchDownloadError(f"Could not fetch game uploads for {label}: {game_uploads_req.text}")

        game_uploads = game_uploads_req.json()['uploads']
        print(f"Found {len(game_uploads)} upload(s)")
        try:
            for upload in game_uploads:
                upload_id = upload['id']
                file_name = upload['filename']
                file_size = upload['size']
                upload_is_external = upload['storage'] == 'external'

                print(f"Downloading '{file_name}' ({upload_id}), {file_size} bytes...")
                if upload_is_external:
                    print("***********************************************************")
                    print("*                                                         *")
                    print("* WARNING: External storage - downloads will likely fail. *")
                    print("*          Check the URL displayed below manually!        *")
                    print("*                                                         *")
                    print("***********************************************************")

                target_path = os.path.join(download_path, file_name)
                try:
                    download_file(client, upload_id, target_path, creds, print_url=upload_is_external)
                except ItchDownloadError as e:
                    jobs_failed.append((game_id, file_name, str(e)))
                    print(f"Download failed for {file_name}: {e}")
                    continue

                try:
                    actual_file_size = os.stat(target_path).st_size
                    if actual_file_size == file_size:
                        jobs_successful.append((game_id, file_name))
                    else:
                        jobs_failed.append((game_id, file_name, f"File size is {actual_file_size}, expected {file_size}"))
                except FileNotFoundError:
                    jobs_failed.append((game_id, file_name, "Could not download file"))

            print(f"Done downloading {label}")
        except ItchDownloadError as e:
            failed_game_ids.append((game_id, str(e)))
            print(e)
            continue
        except Exception as e:
            print(f"Critical error while downloading {label}: {e}")
            failed_game_ids.append((game_id, str(e)))
            traceback.print_exc()
            continue

    successful_titles = {}
    for game_id, file_name in jobs_successful:
        successful_titles.setdefault(game_id, []).append(file_name)

    if any(successful_titles):
        print(f"\nAll done, downloaded files for {len(successful_titles)} title(s):")
        for game_id, files in successful_titles.items():
            print(f"{game_id_to_meta[game_id][0]}, {len(files)} file(s)")

    if any(jobs_failed):
        print(f"\nDownloads failed for {len(jobs_failed)} file(s):")
        for game_id, file_name, message in jobs_failed:
            title, url = game_id_to_meta[game_id]
            print(f"{title} - {file_name} - {message}")
            print(f"Title URL: {url}")

    if any(failed_game_ids):
        print(f"\nCompletely failed downloads for {len(failed_game_ids)} title(s):")
        for game_id, message in failed_game_ids:
            title, url = game_id_to_meta[game_id]
            print(f"{title} ({game_id}) - {url} - {message}")
# ------------------------------
# --- OLD STUFF --- CUT HERE ---
# ------------------------------


class GameAuthor(TypedDict, total=False):
    name: str
    url: str


class GameMetadata(TypedDict, total=False):
    description: str


class GameDownloadJob(TypedDict, total=False):
    url: str
    game_id: int
    title: str
    author: GameAuthor
    metadata: GameMetadata


class GameDownloader:
    def __init__(self, download_to: str, api_key: str, keys: Dict[int, str]):
        self.download_to = download_to
        self.download_keys = keys
        self.client = ItchApiClient(api_key)

    def download(self, url: str):
        job = GameDownloadJob(url=url)
        raise NotImplementedError("Not yet!")


def drive_downloads(jobs: List[str], download_to: str, api_key: str, keys: Dict[int, str], parallel: int = 1):
    downloader = GameDownloader(download_to, api_key, keys)

    if parallel > 1:
        thread_map(downloader.download, jobs, max_workers=parallel)
    else:
        for job in tqdm(jobs):
            downloader.download(job)

itch_dl/handlers.py Normal file (+218)

@@ -0,0 +1,218 @@
import re
import json
import os.path
import logging
import urllib.parse
from typing import List, Optional

from bs4 import BeautifulSoup

from .api import ItchApiClient
from .consts import ITCH_BASE, ITCH_URL, ITCH_BROWSER_TYPES, ItchDownloadError


def get_jobs_for_game_jam_json(game_jam_json: dict) -> List[str]:
    if 'jam_games' not in game_jam_json:
        raise ItchDownloadError("Provided JSON is not a valid itch.io jam JSON.")

    return [g['game']['url'] for g in game_jam_json['jam_games']]


def get_int_after_marker_in_json(text: str, marker: str, key: str) -> Optional[int]:
    """
    Many itch.io sites use a pattern like this: Most of the HTML page
    is prerendered, but certain interactive objects are handled with
    JavaScript initialized with `I.WidgetHandler({"id": 123, ...})`
    somewhere near the end of each page. Those config blocks often
    contain metadata like game/page IDs that we want to extract.
    """
    marker_line: Optional[str] = None
    for line in reversed(text.splitlines()):
        marker_index = line.find(marker)
        if marker_index != -1:
            marker_line = line[marker_index:]
            break

    if marker_line is None:
        return None

    # Notice double-backslashes in the f-string (not an r-string)!
    pattern = f'\\"{key}\\":\\s?(\\d+)'

    found_ints = re.findall(pattern, marker_line)
    if len(found_ints) != 1:
        return None

    return int(found_ints[0])


def get_game_jam_json(jam_url: str, client: ItchApiClient) -> dict:
    r = client.get(jam_url)
    if not r.ok:
        raise ItchDownloadError(f"Could not download the game jam site: {r.status_code} {r.reason}")

    jam_id: Optional[int] = get_int_after_marker_in_json(r.text, "I.ViewJam", "id")
    if jam_id is None:
        raise ItchDownloadError("Provided site did not contain the Game Jam ID. Provide "
                                "the path to the game jam entries JSON file instead, or "
                                "create an itch-dl issue with the Game Jam URL.")

    logging.info(f"Extracted Game Jam ID: {jam_id}")
    r = client.get(f"{ITCH_URL}/jam/{jam_id}/entries.json")
    if not r.ok:
        raise ItchDownloadError(f"Could not download the game jam entries list: {r.status_code} {r.reason}")

    return r.json()


def get_jobs_for_browse_url(url: str, client: ItchApiClient) -> List[str]:
    """
    Every browse page has a hidden RSS feed that can be accessed by
    appending .xml to its URL. An optional "page" argument lets us
    iterate over their contents. When no more elements are available,
    the last returned <channel> has no <item> children.

    The input URL is cleaned in the main URL handler, so append the
    .xml?page=N suffix and iterate until we've caught 'em all.
    """
    page = 1
    found_urls = set()

    logging.info("Scraping game URLs from RSS feeds for %s", url)
    while True:
        logging.info(f"Downloading page {page} (found {len(found_urls)} URLs total)")
        r = client.get(f"{url}.xml?page={page}", append_api_key=False)
        if not r.ok:
            logging.info("RSS feed returned %s, finished.", r.reason)
            break

        soup = BeautifulSoup(r.text, features="xml")
        rss_items = soup.find_all("item")
        if len(rss_items) < 1:
            logging.info("No more items, finished.")
            break

        logging.info(f"Found {len(rss_items)} items.")
        for item in rss_items:
            link_node = item.find("link")
            if link_node is None:
                continue

            node_url = link_node.text.strip()
            if len(node_url) > 0:
                found_urls.add(node_url)

        page += 1

    if len(found_urls) == 0:
        raise ItchDownloadError("No game URLs found to download.")

    return list(found_urls)


def get_jobs_for_itch_url(url: str, client: ItchApiClient) -> List[str]:
    if url.startswith("http://"):
        logging.info("HTTP link provided, upgrading to HTTPS")
        url = "https://" + url[7:]

    if url.startswith(f"https://www.{ITCH_BASE}/"):
        logging.info(f"Correcting www.{ITCH_BASE} to {ITCH_BASE}")
        url = ITCH_URL + '/' + url[20:]

    url_parts = urllib.parse.urlparse(url)
    url_path_parts: List[str] = [x for x in str(url_parts.path).split('/') if len(x) > 0]

    if url_parts.netloc == ITCH_BASE:
        if len(url_path_parts) == 0:
            raise NotImplementedError("itch-dl cannot download the entirety of itch.io.")
            # (yet) (also leafo would not be happy with the bandwidth bill)

        site = url_path_parts[0]

        if site == "jam":  # Game jams
            if len(url_path_parts) < 2:
                raise ValueError(f"Incomplete game jam URL: {url}")

            logging.info("Fetching Game Jam JSON...")
            clean_game_jam_url = f"{ITCH_URL}/jam/{url_path_parts[1]}"
            game_jam_json = get_game_jam_json(clean_game_jam_url, client)
            return get_jobs_for_game_jam_json(game_jam_json)

        elif site in ITCH_BROWSER_TYPES:  # Browse pages
            clean_browse_url = '/'.join([ITCH_URL, *url_path_parts])
            return get_jobs_for_browse_url(clean_browse_url, client)

        elif site in ("b", "bundle"):  # Bundles
            raise NotImplementedError("itch-dl cannot download bundles yet.")

        elif site in ("j", "jobs"):  # Jobs...
            raise ValueError("itch-dl cannot download a job.")

        elif site in ("t", "board", "community"):  # Forums
            raise ValueError("itch-dl cannot download forums.")

        elif site == "profile":  # Forum Profile
            if len(url_path_parts) >= 2:
                username = url_path_parts[1]
                logging.info("Correcting user profile to creator page for %s", username)
                return get_jobs_for_itch_url(f"https://{username}.{ITCH_BASE}", client)

            raise ValueError("itch-dl expects a username in profile links.")

        # Something else?
        raise NotImplementedError(f"itch-dl does not understand \"{site}\" URLs. Please file a new issue.")

    elif url_parts.netloc.endswith(f".{ITCH_BASE}"):
        if len(url_path_parts) == 0:  # Author
            # TODO: Find I.UserPage, regex for "user_id": [0-9]+, find the responsible API?
            raise NotImplementedError("itch-dl cannot download author pages yet.")
        else:  # Single game
            # Just clean and return the URL:
            return [f"https://{url_parts.netloc}/{url_path_parts[0]}"]

    else:
        raise ValueError(f"Unknown domain: {url_parts.netloc}")


def get_jobs_for_path(path: str) -> List[str]:
    try:  # Game Jam Entries JSON?
        with open(path) as f:
            json_data = json.load(f)

        if not isinstance(json_data, dict):
            raise ValueError(f"File does not contain a JSON dict: {path}")

        if 'jam_games' in json_data:
            logging.info("Parsing provided file as a Game Jam Entries JSON...")
            return get_jobs_for_game_jam_json(json_data)
    except json.JSONDecodeError:
        pass  # Not a valid JSON, okay...

    url_list = []
    with open(path) as f:  # Plain job list?
        for line in f:
            line = line.strip()
            if line.startswith("https://") or line.startswith("http://"):
                url_list.append(line)

    if len(url_list) > 0:
        logging.info("Parsing provided file as a list of URLs to fetch...")
        return url_list

    raise ValueError("File format is unknown - cannot read URLs to download.")


def get_jobs_for_url_or_path(path_or_url: str, api_key: str) -> List[str]:
    """Returns a list of Game URLs for a given itch.io URL or file."""
    path_or_url = path_or_url.strip()

    if path_or_url.startswith("http://"):
        logging.info("HTTP link provided, upgrading to HTTPS")
        path_or_url = "https://" + path_or_url[7:]

    if path_or_url.startswith("https://"):
        client = ItchApiClient(api_key)
        return get_jobs_for_itch_url(path_or_url, client)
    elif os.path.isfile(path_or_url):
        return get_jobs_for_path(path_or_url)
    else:
        raise ValueError(f"Provided argument is not an itch.io URL or a readable file: {path_or_url}")

itch_dl/keys.py Normal file (+31)

@@ -0,0 +1,31 @@
import logging
from typing import Dict

from .api import ItchApiClient


def get_download_keys(client: ItchApiClient) -> Dict[int, str]:
    logging.info("Fetching all download keys...")
    download_keys = {}
    page = 1

    while True:
        logging.info(f"Downloading page {page} (found {len(download_keys)} keys total)")
        r = client.get("/profile/owned-keys", data={"page": page}, timeout=15)
        if not r.ok:
            break

        data = r.json()
        if 'owned_keys' not in data:
            break  # Assuming we're out of keys already...

        for key in data['owned_keys']:
            download_keys[key['game_id']] = key['id']
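
        # A full page means more keys may follow; an underfull page must be the last one.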
        if len(data['owned_keys']) == data['per_page']:
            page += 1
        else:
            break

    logging.info(f"Fetched {len(download_keys)} download keys.")
    return download_keys