Trial The Third: Start rewriting the thing

Wooo, someone wants to use this! Let's make it less embarrassing.
2022-05-15 02:02:45 +02:00
parent 00cced1f41
commit 4a8f88b48e
13 changed files with 676 additions and 379 deletions
--- a/itch_dl/handlers.py
+++ b/itch_dl/handlers.py
@@ -0,0 +1,218 @@
+import re
+import json
+import os.path
+import logging
+import urllib.parse
+from typing import List, Optional
+
+from bs4 import BeautifulSoup
+
+from .api import ItchApiClient
+from .consts import ITCH_BASE, ITCH_URL, ITCH_BROWSER_TYPES, ItchDownloadError
+
+
+def get_jobs_for_game_jam_json(game_jam_json: dict) -> List[str]:
+    """
+    Extract the game page URLs from a parsed game jam entries JSON.
+
+    The input is expected to hold a 'jam_games' list whose elements
+    each carry a nested 'game' object with a 'url' value.
+
+    Raises ValueError when the 'jam_games' key is missing. ValueError
+    is what the other job-list parsers in this module raise for
+    malformed input, and it is still caught by `except Exception`.
+    """
+    if 'jam_games' not in game_jam_json:
+        raise ValueError("Provided JSON is not a valid itch.io jam JSON.")
+
+    return [entry['game']['url'] for entry in game_jam_json['jam_games']]
+
+
+def get_int_after_marker_in_json(text: str, marker: str, key: str) -> Optional[int]:
+    """
+    Many itch.io sites use a pattern like this: Most of the HTML page
+    is prerendered, but certain interactive objects are handled with
+    JavaScript initialized with `I.WidgetHandler({"id": 123, ...})`
+    somewhere near the end of each page. Those config blocks often
+    contain metadata like game/page IDs that we want to extract.
+
+    The text is scanned from its last line upwards for the first line
+    containing `marker`. The part of that line starting at the marker
+    is then searched for `"<key>": <digits>`.
+
+    Returns the integer when the key matches exactly once. Returns None
+    when the marker is not found, or when the key matches zero times or
+    more than once (an ambiguous result is not guessed at).
+    """
+    marker_line: Optional[str] = None
+    for line in reversed(text.splitlines()):
+        marker_index = line.find(marker)
+        if marker_index != -1:
+            marker_line = line[marker_index:]
+            break
+
+    if marker_line is None:
+        return None
+
+    # The key is escaped so that it is matched literally; without this,
+    # a key such as "a.b" would also match "axb".
+    pattern = rf'"{re.escape(key)}":\s?(\d+)'
+
+    found_ints = re.findall(pattern, marker_line)
+    if len(found_ints) != 1:
+        return None
+
+    return int(found_ints[0])
+
+
+def get_game_jam_json(jam_url: str, client: ItchApiClient) -> dict:
+    """
+    Download the entries JSON for the game jam page at `jam_url`.
+
+    The jam page itself is fetched first so that the numeric jam ID can
+    be read from its `I.ViewJam` script block; that ID is then used to
+    request the jam's `entries.json`.
+
+    Raises ItchDownloadError when either request fails or when no jam
+    ID can be extracted from the page.
+    """
+    jam_page = client.get(jam_url)
+    if not jam_page.ok:
+        raise ItchDownloadError(f"Could not download the game jam site: {jam_page.status_code} {jam_page.reason}")
+
+    jam_id = get_int_after_marker_in_json(jam_page.text, "I.ViewJam", "id")
+    if jam_id is None:
+        missing_id_message = (
+            "Provided site did not contain the Game Jam ID. Provide "
+            "the path to the game jam entries JSON file instead, or "
+            "create an itch-dl issue with the Game Jam URL."
+        )
+        raise ItchDownloadError(missing_id_message)
+
+    logging.info(f"Extracted Game Jam ID: {jam_id}")
+
+    entries_url = f"{ITCH_URL}/jam/{jam_id}/entries.json"
+    entries_response = client.get(entries_url)
+    if not entries_response.ok:
+        raise ItchDownloadError(f"Could not download the game jam entries list: {entries_response.status_code} {entries_response.reason}")
+
+    return entries_response.json()
+
+
+def get_jobs_for_browse_url(url: str, client: ItchApiClient) -> List[str]:
+    """
+    Every browser page has a hidden RSS feed that can be accessed by
+    appending .xml to its URL. An optional "page" argument lets us
+    iterate over their contents. When no more elements are available,
+    the last returned <channel> has no <item> children.
+
+    The input URL is cleaned in the main URL handler, so append the
+    .xml?page=N suffix and iterate until we've caught 'em all.
+
+    Returns the unique game URLs in the order they were first seen in
+    the feed. Raises ItchDownloadError when no URL was found at all.
+    """
+    page = 1
+    # A dict serves as an insertion-ordered set here: duplicates are
+    # dropped, but the resulting job order is the same on every run
+    # (a plain set of strings iterates in a hash-dependent order).
+    found_urls: dict = {}
+    logging.info("Scraping game URLs from RSS feeds for %s", url)
+
+    while True:
+        logging.info(f"Downloading page {page} (found {len(found_urls)} URLs total)")
+        r = client.get(f"{url}.xml?page={page}", append_api_key=False)
+        if not r.ok:
+            logging.info("RSS feed returned %s, finished.", r.reason)
+            break
+
+        soup = BeautifulSoup(r.text, features="xml")
+        rss_items = soup.find_all("item")
+        if not rss_items:
+            logging.info("No more items, finished.")
+            break
+
+        logging.info(f"Found {len(rss_items)} items.")
+        for item in rss_items:
+            link_node = item.find("link")
+            if link_node is None:
+                continue
+
+            node_url = link_node.text.strip()
+            if node_url:
+                found_urls[node_url] = None
+
+        page += 1
+
+    if not found_urls:
+        raise ItchDownloadError("No game URLs found to download.")
+
+    return list(found_urls)
+
+
+def get_jobs_for_itch_url(url: str, client: ItchApiClient) -> List[str]:
+    """
+    Resolve an itch.io URL into the list of game page URLs to download.
+
+    The URL is first normalized (HTTP is upgraded to HTTPS and the
+    `www.` host is rewritten to the bare domain), then dispatched on
+    its host and first path segment:
+
+    - `<base>/jam/<name>`: all entries of that game jam,
+    - `<base>/<browser type>/...`: everything listed by that browse page,
+    - `<base>/profile/<user>`: retried as `<user>.<base>`,
+    - `<sub>.<base>/<game>`: that single game page, cleaned up.
+
+    Raises NotImplementedError for URL kinds that are recognized but
+    not supported (site root, bundles, author pages, unknown sections)
+    and ValueError for URLs that can never be downloaded or are
+    incomplete (jobs, forums, bare jam/profile links, foreign domains).
+    Errors raised by the jam and browse helpers propagate unchanged.
+    """
+    http_prefix = "http://"
+    if url.startswith(http_prefix):
+        logging.info("HTTP link provided, upgrading to HTTPS")
+        url = "https://" + url[len(http_prefix):]
+
+    # The prefix length is measured rather than hard-coded, so the
+    # rewrite stays correct for any ITCH_BASE. The bare host without a
+    # trailing slash is handled too; otherwise it would be mistaken
+    # for an author subdomain further down.
+    www_prefix = f"https://www.{ITCH_BASE}"
+    if url == www_prefix or url.startswith(www_prefix + "/"):
+        logging.info(f"Correcting www.{ITCH_BASE} to {ITCH_BASE}")
+        url = ITCH_URL + url[len(www_prefix):]
+
+    url_parts = urllib.parse.urlparse(url)
+    url_path_parts: List[str] = [x for x in str(url_parts.path).split('/') if len(x) > 0]
+
+    if url_parts.netloc == ITCH_BASE:
+        if len(url_path_parts) == 0:
+            raise NotImplementedError("itch-dl cannot download the entirety of itch.io.")
+        # (yet) (also leafo would not be happy with the bandwidth bill)
+
+        site = url_path_parts[0]
+
+        if site == "jam":  # Game jams
+            if len(url_path_parts) < 2:
+                raise ValueError(f"Incomplete game jam URL: {url}")
+
+            logging.info("Fetching Game Jam JSON...")
+            # Anything after the jam name (sub-pages etc.) is dropped.
+            clean_game_jam_url = f"{ITCH_URL}/jam/{url_path_parts[1]}"
+            game_jam_json = get_game_jam_json(clean_game_jam_url, client)
+            return get_jobs_for_game_jam_json(game_jam_json)
+
+        elif site in ITCH_BROWSER_TYPES:  # Browser
+            # Rebuilding from path parts strips query, fragment and
+            # any empty path segments.
+            clean_browse_url = '/'.join([ITCH_URL, *url_path_parts])
+            return get_jobs_for_browse_url(clean_browse_url, client)
+
+        elif site in ("b", "bundle"):  # Bundles
+            raise NotImplementedError("itch-dl cannot download bundles yet.")
+
+        elif site in ("j", "jobs"):  # Jobs...
+            raise ValueError("itch-dl cannot download a job.")
+
+        elif site in ("t", "board", "community"):  # Forums
+            raise ValueError("itch-dl cannot download forums.")
+
+        elif site == "profile":  # Forum Profile
+            if len(url_path_parts) >= 2:
+                username = url_path_parts[1]
+                logging.info("Correcting user profile to creator page for %s", username)
+                return get_jobs_for_itch_url(f"https://{username}.{ITCH_BASE}", client)
+
+            raise ValueError("itch-dl expects a username in profile links.")
+
+        # Something else?
+        raise NotImplementedError(f"itch-dl does not understand \"{site}\" URLs. Please file a new issue.")
+
+    elif url_parts.netloc.endswith(f".{ITCH_BASE}"):
+        if len(url_path_parts) == 0:  # Author
+            # TODO: Find I.UserPage, regex for "user_id": [0-9]+, find the responsible API?
+            raise NotImplementedError("itch-dl cannot download author pages yet.")
+
+        else:  # Single game
+            # Just clean and return the URL:
+            return [f"https://{url_parts.netloc}/{url_path_parts[0]}"]
+
+    else:
+        raise ValueError(f"Unknown domain: {url_parts.netloc}")
+
+
+def get_jobs_for_path(path: str) -> List[str]:
+    """
+    Read the list of game URLs to download from a local file.
+
+    Two formats are understood:
+
+    - a game jam entries JSON (a dict with a 'jam_games' key),
+    - a plain text file with one URL per line; lines that do not
+      start with http:// or https:// are ignored.
+
+    Raises ValueError when the file is valid JSON but not a dict, or
+    when no URL could be read from it in either format.
+    """
+    # Files are decoded as UTF-8 instead of the platform's locale
+    # encoding. The "-sig" variant additionally drops a leading byte
+    # order mark, which would otherwise hide the first URL of a list.
+    file_encoding = "utf-8-sig"
+
+    try:  # Game Jam Entries JSON?
+        with open(path, encoding=file_encoding) as f:
+            json_data = json.load(f)
+
+        if not isinstance(json_data, dict):
+            raise ValueError(f"File does not contain a JSON dict: {path}")
+
+        if 'jam_games' in json_data:
+            logging.info("Parsing provided file as a Game Jam Entries JSON...")
+            return get_jobs_for_game_jam_json(json_data)
+    except json.JSONDecodeError:
+        pass  # Not a valid JSON, okay...
+
+    with open(path, encoding=file_encoding) as f:  # Plain job list?
+        stripped_lines = (line.strip() for line in f)
+        url_list = [line for line in stripped_lines if line.startswith(("https://", "http://"))]
+
+    if url_list:
+        logging.info("Parsing provided file as a list of URLs to fetch...")
+        return url_list
+
+    raise ValueError("File format is unknown - cannot read URLs to download.")
+
+
+def get_jobs_for_url_or_path(path_or_url: str, api_key: str) -> List[str]:
+    """
+    Returns a list of Game URLs for a given itch.io URL or file.
+
+    Surrounding whitespace is ignored and http:// links are upgraded
+    to https://. URLs are resolved with an API client created from
+    `api_key`; anything else must be the path of an existing file.
+
+    Raises ValueError when the input is neither an HTTP(S) URL nor an
+    existing file. Errors from the URL and file handlers propagate.
+    """
+    path_or_url = path_or_url.strip()
+
+    if path_or_url.startswith("http://"):
+        logging.info("HTTP link provided, upgrading to HTTPS")
+        path_or_url = "https://" + path_or_url[7:]
+
+    if path_or_url.startswith("https://"):
+        client = ItchApiClient(api_key)
+        return get_jobs_for_itch_url(path_or_url, client)
+
+    if os.path.isfile(path_or_url):
+        return get_jobs_for_path(path_or_url)
+
+    # Fail loudly instead of falling off the end and handing the caller
+    # a None where a list of URLs is promised.
+    raise ValueError(f"Not an itch.io URL or an existing file: {path_or_url}")