forked from Mirrors/itch-dl

Trial The Third: Start rewriting the thing

Wooo, someone wants to use this! Let's make it less embarrassing.
Ryszard Knop
2022-05-15 02:02:45 +02:00
parent 00cced1f41
commit 4a8f88b48e
13 changed files with 676 additions and 379 deletions

itch_dl/__init__.py Normal file (+1)

@@ -0,0 +1 @@
__version__ = '0.1.0'

itch_dl/__main__.py Normal file (+3)

@@ -0,0 +1,3 @@
#!/usr/bin/env python3
from itch_dl.cli import run
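# Allows running the tool as `python -m itch_dl <url_or_path> --api-key <key>`: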
run()

itch_dl/api.py Normal file (+43)

@@ -0,0 +1,43 @@
from typing import Optional

from requests import Session
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from .consts import ITCH_API


class ItchApiClient:
    def __init__(self, api_key: str, base_url: Optional[str] = None):
        self.base_url = base_url or ITCH_API
        self.api_key = api_key

        self.requests = Session()

        retry_strategy = Retry(
            total=5,
            backoff_factor=10,
            allowed_methods=["HEAD", "GET"],
            status_forcelist=[429, 500, 502, 503, 504]
        )

        # No timeouts - set them explicitly on API calls below!
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.requests.mount("https://", adapter)
        self.requests.mount("http://", adapter)
    def get(self, endpoint: str, append_api_key: bool = True, **kwargs):
        if append_api_key:
            params = kwargs.get('data') or {}
            if 'api_key' not in params:
                params['api_key'] = self.api_key
            kwargs['data'] = params

        if endpoint.startswith("https://"):
            url = endpoint
        else:
            url = self.base_url + endpoint

        return self.requests.get(url, **kwargs)

itch_dl/cli.py Normal file (+67)

@@ -0,0 +1,67 @@
import os
import logging
import argparse

from .handlers import get_jobs_for_url_or_path
from .downloader import drive_downloads
from .keys import get_download_keys
from .api import ItchApiClient

logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)


def parse_args():
    parser = argparse.ArgumentParser(description="Bulk download stuff from Itch.io.")
    parser.add_argument("url_or_path",
                        help="itch.io URL or path to a game jam entries.json file")
    parser.add_argument("--api-key", metavar="key", required=True,
                        help="itch.io API key - https://itch.io/user/settings/api-keys")
    parser.add_argument("--urls-only", action="store_true",
                        help="print scraped game URLs without downloading them")
    parser.add_argument("--download-to", metavar="path",
                        help="directory to save results into (default: current dir)")
    parser.add_argument("--parallel", metavar="parallel", type=int, default=1,
                        help="how many threads to use for downloading games (default: 1)")
    parser.add_argument("--mirror-web", action="store_true",
                        help="try to fetch assets on game sites")
    parser.add_argument("--verbose", action="store_true",
                        help="print verbose logs")
    return parser.parse_args()


def run() -> int:
    args = parse_args()
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    jobs = get_jobs_for_url_or_path(args.url_or_path, args.api_key)
    jobs = list(set(jobs))  # Deduplicate, just in case...
    logging.info(f"Found {len(jobs)} URL(s).")

    if len(jobs) == 0:
        print("No URLs to download.")
        return 1

    if args.urls_only:
        for job in jobs:
            print(job)
        return 0

    download_to = os.getcwd()
    if args.download_to is not None:
        download_to = os.path.normpath(args.download_to)
        os.makedirs(download_to, exist_ok=True)

    client = ItchApiClient(args.api_key)

    # Check API key validity:
    profile_req = client.get("/profile")
    if not profile_req.ok:
        print(f"Provided API key appears to be invalid: {profile_req.text}")
        return 1

    # Grab all the download keys (there's no way to fetch them per title...):
    keys = get_download_keys(client)

    return drive_downloads(jobs, download_to, args.api_key, keys, parallel=args.parallel)

itch_dl/consts.py Normal file (+29)

@@ -0,0 +1,29 @@
from enum import Enum

ITCH_BASE = "itch.io"
ITCH_URL = f"https://{ITCH_BASE}"
ITCH_API = f"https://api.{ITCH_BASE}"

ITCH_BROWSER_TYPES = [
    "games",
    "tools",
    "game-assets",
    "comics",
    "books",
    "physical-games",
    "soundtracks",
    "game-mods",
    "misc",
]


class ItchDownloadResult(Enum):
    SUCCESS = 0
    FAILURE = 1
    MISSING_DOWNLOAD = 2
    DOWNLOAD_TIMEOUT = 3


# I mean, not really a const but eh
class ItchDownloadError(Exception):
    pass

itch_dl/downloader.py Normal file (+251)

@@ -0,0 +1,251 @@
import os
import shutil
import logging
import traceback
import subprocess
from typing import List, Dict, TypedDict, Optional

import requests
from requests.exceptions import HTTPError
from slugify import slugify
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map

from .api import ItchApiClient
from .keys import get_download_keys
from .consts import ItchDownloadError, ItchDownloadResult

# ------------------------------
# --- OLD STUFF --- CUT HERE ---
# (Pre-rewrite download loop, kept for reference while porting; the
# parse_jobs helper it calls no longer exists in this tree.)
# ------------------------------

WGET_PATH = shutil.which("wget")
if WGET_PATH is None:
    print("Warning: wget not available, site mirroring will not work!")

# Legacy switch from the pre-rewrite script: mirror sites into a shared
# _sites directory instead of per-game "site" subdirectories.
PEDANTIC_MIRRORING = False
def download_file(client: ItchApiClient, upload_id: int, download_path: str, creds: dict, print_url: bool = False):
    # No timeouts, chunked uploads, default retry strategy, should be all good?
    try:
        with client.get(f"/uploads/{upload_id}/download", data=creds, stream=True) as r:
            r.raise_for_status()

            if print_url:
                print(f"Download URL: {r.url}")

            with open(download_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1048576):  # 1MB chunks
                    f.write(chunk)
    except HTTPError as e:
        raise ItchDownloadError(f"Unrecoverable download error: {e}")


def get_meta_for_game_url(game_url: str) -> int:
    """Finds the Game ID for a Game URL."""
    data_url = game_url.rstrip("/") + "/data.json"
    data_req = requests.get(data_url)
    data_req.raise_for_status()

    data_json = data_req.json()
    if 'id' not in data_json:
        raise ItchDownloadError(f"Cannot fetch the Game ID for URL: {game_url}")

    return data_json['id']
def download_jam(jam_path: str, download_to: str, api_key: str, continue_from: str = None):
    client = ItchApiClient(api_key)
    jam_json = get_game_jam_json(jam_path, client)

    # Check API key validity:
    profile_req = client.get("/profile")
    if not profile_req.ok:
        print(f"Provided API key appears to be invalid: {profile_req.text}")
        exit(1)

    download_keys = get_download_keys(client)

    jobs = parse_jobs(jam_json)
    jobs_successful = []
    jobs_failed = []

    game_id_to_meta = {}  # dict[game_id: int, (title: str, url: str)]
    for game_id, title, url in jobs:
        game_id_to_meta[game_id] = (title, url)

    failed_game_ids = []

    # No "continue from"? Yep, start right away.
    should_process_jobs = continue_from is None

    for game_id, title, url in jobs:
        label = f"{title} ({game_id})"
        if not should_process_jobs:
            if game_id == continue_from:
                should_process_jobs = True
            else:
                continue

        try:
            download_path = os.path.join(download_to, slugify(title))
            if PEDANTIC_MIRRORING:
                site_mirror_path = os.path.join(download_to, "_sites")
            else:
                site_mirror_path = os.path.join(download_path, "site")
            os.makedirs(download_path, exist_ok=True)
            os.makedirs(site_mirror_path, exist_ok=True)
        except OSError:
            raise ItchDownloadError(f"Could not create download directory: {download_path}")

        print(f"Trying to download {label} to {download_path}")

        if WGET_PATH is not None:
            print("Downloading site...")
            if PEDANTIC_MIRRORING:
                extra_wget_args = [
                    "--timestamping",
                    "--span-hosts",
                    "--convert-links",
                    "--adjust-extension",
                    "--page-requisites",
                ]
            else:
                extra_wget_args = []

            wget = subprocess.run([
                WGET_PATH,
                *extra_wget_args,
                "--quiet",
                url
            ], cwd=site_mirror_path)

            if wget.returncode != 0:
                print("Warning: Site mirroring failed/incomplete.")

        creds = {}
        if game_id in download_keys:
            creds['download_key_id'] = download_keys[game_id]
            print(f"Using {creds} for private uploads")

        game_uploads_req = client.get(f"/games/{game_id}/uploads", data=creds, timeout=15)
        if not game_uploads_req.ok:
            raise ItchDownloadError(f"Could not fetch game uploads for {label}: {game_uploads_req.text}")

        game_uploads = game_uploads_req.json()['uploads']
        print(f"Found {len(game_uploads)} upload(s)")
        try:
            for upload in game_uploads:
                upload_id = upload['id']
                file_name = upload['filename']
                file_size = upload['size']
                upload_is_external = upload['storage'] == 'external'

                print(f"Downloading '{file_name}' ({upload_id}), {file_size} bytes...")
                if upload_is_external:
                    print("***********************************************************")
                    print("*                                                         *")
                    print("* WARNING: External storage - downloads will likely fail. *")
                    print("*          Check the URL displayed below manually!        *")
                    print("*                                                         *")
                    print("***********************************************************")

                target_path = os.path.join(download_path, file_name)
                try:
                    download_file(client, upload_id, target_path, creds, print_url=upload_is_external)
                except ItchDownloadError as e:
                    jobs_failed.append((game_id, file_name, str(e)))
                    print(f"Download failed for {file_name}: {e}")
                    continue

                try:
                    actual_file_size = os.stat(target_path).st_size
                    if actual_file_size == file_size:
                        jobs_successful.append((game_id, file_name))
                    else:
                        jobs_failed.append((game_id, file_name, f"File size is {actual_file_size}, expected {file_size}"))
                except FileNotFoundError:
                    jobs_failed.append((game_id, file_name, "Could not download file"))

            print(f"Done downloading {label}")
        except ItchDownloadError as e:
            failed_game_ids.append((game_id, str(e)))
            print(e)
            continue
        except Exception as e:
            print(f"Critical error while downloading {label}: {e}")
            failed_game_ids.append((game_id, str(e)))
            traceback.print_exc()
            continue

    successful_titles = {}
    for game_id, file_name in jobs_successful:
        successful_titles.setdefault(game_id, []).append(file_name)

    if any(successful_titles):
        print(f"\nAll done, downloaded files for {len(successful_titles)} title(s):")
        for game_id, files in successful_titles.items():
            print(f"{game_id_to_meta[game_id][0]}, {len(files)} file(s)")

    if any(jobs_failed):
        print(f"\nDownloads failed for {len(jobs_failed)} file(s):")
        for game_id, file_name, message in jobs_failed:
            title, url = game_id_to_meta[game_id]
            print(f"{title} - {file_name} - {message}")
            print(f"Title URL: {url}")

    if any(failed_game_ids):
        print(f"\nCompletely failed downloads for {len(failed_game_ids)} title(s):")
        for game_id, message in failed_game_ids:
            title, url = game_id_to_meta[game_id]
            print(f"{title} ({game_id}) - {url} - {message}")
# ------------------------------
# --- OLD STUFF --- CUT HERE ---
# ------------------------------


class GameAuthor(TypedDict, total=False):
    name: str
    url: str


class GameMetadata(TypedDict, total=False):
    description: str


class GameDownloadJob(TypedDict, total=False):
    url: str
    game_id: int
    title: str
    author: GameAuthor
    metadata: GameMetadata


class GameDownloader:
    def __init__(self, download_to: str, api_key: str, keys: Dict[int, str]):
        self.download_to = download_to
        self.download_keys = keys
        self.client = ItchApiClient(api_key)

    def download(self, url: str):
        job = GameDownloadJob(url=url)
        raise NotImplementedError("Not yet!")


def drive_downloads(jobs: List[str], download_to: str, api_key: str, keys: Dict[int, str], parallel: int = 1):
    downloader = GameDownloader(download_to, api_key, keys)

    if parallel > 1:
        thread_map(downloader.download, jobs, max_workers=parallel)
    else:
        for job in tqdm(jobs):
            downloader.download(job)

itch_dl/handlers.py Normal file (+218)

@@ -0,0 +1,218 @@
import re
import json
import os.path
import logging
import urllib.parse
from typing import List, Optional

from bs4 import BeautifulSoup

from .api import ItchApiClient
from .consts import ITCH_BASE, ITCH_URL, ITCH_BROWSER_TYPES, ItchDownloadError


def get_jobs_for_game_jam_json(game_jam_json: dict) -> List[str]:
    if 'jam_games' not in game_jam_json:
        raise ItchDownloadError("Provided JSON is not a valid itch.io jam JSON.")

    return [g['game']['url'] for g in game_jam_json['jam_games']]


def get_int_after_marker_in_json(text: str, marker: str, key: str) -> Optional[int]:
    """
    Many itch.io sites use a pattern like this: Most of the HTML page
    is prerendered, but certain interactive objects are handled with
    JavaScript initialized with `I.WidgetHandler({"id": 123, ...})`
    somewhere near the end of each page. Those config blocks often
    contain metadata like game/page IDs that we want to extract.
    """
    marker_line: Optional[str] = None
    for line in reversed(text.splitlines()):
        marker_index = line.find(marker)
        if marker_index != -1:
            marker_line = line[marker_index:]
            break

    if marker_line is None:
        return None

    # Notice double-backslashes in the f-string (not an r-string)!
    pattern = f'\\"{key}\\":\\s?(\\d+)'

    found_ints = re.findall(pattern, marker_line)
    if len(found_ints) != 1:
        return None

    return int(found_ints[0])


def get_game_jam_json(jam_url: str, client: ItchApiClient) -> dict:
    r = client.get(jam_url)
    if not r.ok:
        raise ItchDownloadError(f"Could not download the game jam site: {r.status_code} {r.reason}")

    jam_id: Optional[int] = get_int_after_marker_in_json(r.text, "I.ViewJam", "id")
    if jam_id is None:
        raise ItchDownloadError("Provided site did not contain the Game Jam ID. Provide "
                                "the path to the game jam entries JSON file instead, or "
                                "create an itch-dl issue with the Game Jam URL.")

    logging.info(f"Extracted Game Jam ID: {jam_id}")
    r = client.get(f"{ITCH_URL}/jam/{jam_id}/entries.json")
    if not r.ok:
        raise ItchDownloadError(f"Could not download the game jam entries list: {r.status_code} {r.reason}")

    return r.json()


def get_jobs_for_browse_url(url: str, client: ItchApiClient) -> List[str]:
    """
    Every browse page has a hidden RSS feed that can be accessed by
    appending .xml to its URL. An optional "page" argument lets us
    iterate over their contents. When no more elements are available,
    the last returned <channel> has no <item> children.

    The input URL is cleaned in the main URL handler, so append the
    .xml?page=N suffix and iterate until we've caught 'em all.
    """
    page = 1
    found_urls = set()

    logging.info("Scraping game URLs from RSS feeds for %s", url)
    while True:
        logging.info(f"Downloading page {page} (found {len(found_urls)} URLs total)")
        r = client.get(f"{url}.xml?page={page}", append_api_key=False)
        if not r.ok:
            logging.info("RSS feed returned %s, finished.", r.reason)
            break

        soup = BeautifulSoup(r.text, features="xml")
        rss_items = soup.find_all("item")
        if len(rss_items) < 1:
            logging.info("No more items, finished.")
            break

        logging.info(f"Found {len(rss_items)} items.")
        for item in rss_items:
            link_node = item.find("link")
            if link_node is None:
                continue

            node_url = link_node.text.strip()
            if len(node_url) > 0:
                found_urls.add(node_url)

        page += 1

    if len(found_urls) == 0:
        raise ItchDownloadError("No game URLs found to download.")

    return list(found_urls)


def get_jobs_for_itch_url(url: str, client: ItchApiClient) -> List[str]:
    if url.startswith("http://"):
        logging.info("HTTP link provided, upgrading to HTTPS")
        url = "https://" + url[7:]

    if url.startswith(f"https://www.{ITCH_BASE}/"):
        logging.info(f"Correcting www.{ITCH_BASE} to {ITCH_BASE}")
        url = ITCH_URL + '/' + url[20:]

    url_parts = urllib.parse.urlparse(url)
    url_path_parts: List[str] = [x for x in str(url_parts.path).split('/') if len(x) > 0]

    if url_parts.netloc == ITCH_BASE:
        if len(url_path_parts) == 0:
            raise NotImplementedError("itch-dl cannot download the entirety of itch.io.")
            # (yet) (also leafo would not be happy with the bandwidth bill)

        site = url_path_parts[0]

        if site == "jam":  # Game jams
            if len(url_path_parts) < 2:
                raise ValueError(f"Incomplete game jam URL: {url}")

            logging.info("Fetching Game Jam JSON...")
            clean_game_jam_url = f"{ITCH_URL}/jam/{url_path_parts[1]}"
            game_jam_json = get_game_jam_json(clean_game_jam_url, client)
            return get_jobs_for_game_jam_json(game_jam_json)

        elif site in ITCH_BROWSER_TYPES:  # Browse pages
            clean_browse_url = '/'.join([ITCH_URL, *url_path_parts])
            return get_jobs_for_browse_url(clean_browse_url, client)

        elif site in ("b", "bundle"):  # Bundles
            raise NotImplementedError("itch-dl cannot download bundles yet.")

        elif site in ("j", "jobs"):  # Jobs...
            raise ValueError("itch-dl cannot download a job.")

        elif site in ("t", "board", "community"):  # Forums
            raise ValueError("itch-dl cannot download forums.")

        elif site == "profile":  # Forum Profile
            if len(url_path_parts) >= 2:
                username = url_path_parts[1]
                logging.info("Correcting user profile to creator page for %s", username)
                return get_jobs_for_itch_url(f"https://{username}.{ITCH_BASE}", client)

            raise ValueError("itch-dl expects a username in profile links.")

        # Something else?
        raise NotImplementedError(f"itch-dl does not understand \"{site}\" URLs. Please file a new issue.")

    elif url_parts.netloc.endswith(f".{ITCH_BASE}"):
        if len(url_path_parts) == 0:  # Author
            # TODO: Find I.UserPage, regex for "user_id": [0-9]+, find the responsible API?
            raise NotImplementedError("itch-dl cannot download author pages yet.")
        else:  # Single game
            # Just clean and return the URL:
            return [f"https://{url_parts.netloc}/{url_path_parts[0]}"]

    else:
        raise ValueError(f"Unknown domain: {url_parts.netloc}")


def get_jobs_for_path(path: str) -> List[str]:
    try:  # Game Jam Entries JSON?
        with open(path) as f:
            json_data = json.load(f)

        if not isinstance(json_data, dict):
            raise ValueError(f"File does not contain a JSON dict: {path}")

        if 'jam_games' in json_data:
            logging.info("Parsing provided file as a Game Jam Entries JSON...")
            return get_jobs_for_game_jam_json(json_data)
    except json.JSONDecodeError:
        pass  # Not a valid JSON, okay...

    url_list = []
    with open(path) as f:  # Plain job list?
        for line in f:
            line = line.strip()
            if line.startswith("https://") or line.startswith("http://"):
                url_list.append(line)

    if len(url_list) > 0:
        logging.info("Parsing provided file as a list of URLs to fetch...")
        return url_list

    raise ValueError("File format is unknown - cannot read URLs to download.")


def get_jobs_for_url_or_path(path_or_url: str, api_key: str) -> List[str]:
    """Returns a list of Game URLs for a given itch.io URL or file."""
    path_or_url = path_or_url.strip()

    if path_or_url.startswith("http://"):
        logging.info("HTTP link provided, upgrading to HTTPS")
        path_or_url = "https://" + path_or_url[7:]

    if path_or_url.startswith("https://"):
        client = ItchApiClient(api_key)
        return get_jobs_for_itch_url(path_or_url, client)
    elif os.path.isfile(path_or_url):
        return get_jobs_for_path(path_or_url)
    else:
        raise ValueError(f"Provided argument is not an itch.io URL or a readable file: {path_or_url}")

itch_dl/keys.py Normal file (+31)

@@ -0,0 +1,31 @@
import logging
from typing import Dict

from .api import ItchApiClient


def get_download_keys(client: ItchApiClient) -> Dict[int, str]:
    logging.info("Fetching all download keys...")
    download_keys = {}
    page = 1

    while True:
        logging.info(f"Downloading page {page} (found {len(download_keys)} keys total)")
        r = client.get("/profile/owned-keys", data={"page": page}, timeout=15)
        if not r.ok:
            break

        data = r.json()
        if 'owned_keys' not in data:
            break  # Assuming we're out of keys already...

        for key in data['owned_keys']:
            download_keys[key['game_id']] = key['id']
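
        # A full page means more keys may follow; an underfull page must be the last one.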
        if len(data['owned_keys']) == data['per_page']:
            page += 1
        else:
            break

    logging.info(f"Fetched {len(download_keys)} download keys.")
    return download_keys