forked from Mirrors/itch-dl
Trial The Third: Start rewriting the thing
Wooo, someone wants to use this! Let's make it less embarrassing.
This commit is contained in:
218
itch_dl/handlers.py
Normal file
218
itch_dl/handlers.py
Normal file
@@ -0,0 +1,218 @@
|
||||
import re
|
||||
import json
|
||||
import os.path
|
||||
import logging
|
||||
import urllib.parse
|
||||
from typing import List, Optional
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .api import ItchApiClient
|
||||
from .consts import ITCH_BASE, ITCH_URL, ITCH_BROWSER_TYPES, ItchDownloadError
|
||||
|
||||
|
||||
def get_jobs_for_game_jam_json(game_jam_json: dict) -> List[str]:
    """
    Extract the game page URLs from a parsed game jam entries JSON.

    :param game_jam_json: Parsed JSON dict. It must contain a 'jam_games'
        list whose entries each hold a 'game' dict with a 'url' key.
    :return: The 'url' value of every entry, in document order.
    :raises ValueError: If the 'jam_games' key is missing.
    """
    if 'jam_games' not in game_jam_json:
        # ValueError is still an Exception subclass, so broad handlers keep
        # working, but callers can now tell bad input from unrelated errors.
        raise ValueError("Provided JSON is not a valid itch.io jam JSON.")

    return [entry['game']['url'] for entry in game_jam_json['jam_games']]
|
||||
|
||||
|
||||
def get_int_after_marker_in_json(text: str, marker: str, key: str) -> Optional[int]:
    """
    Many itch.io sites use a pattern like this: Most of the HTML page
    is prerendered, but certain interactive objects are handled with
    JavaScript initialized with `I.WidgetHandler({"id": 123, ...})`
    somewhere near the end of each page. Those config blocks often
    contain metadata like game/page IDs that we want to extract.

    The lines of `text` are searched from the end for the first one that
    contains `marker`. Only the part of that line starting at the marker
    is then scanned for `"<key>": <digits>`.

    :param text: Page source to search.
    :param marker: Literal substring that identifies the config block.
    :param key: JSON key whose integer value is wanted (matched literally).
    :return: The integer value, or None if the marker is absent or the key
        is found zero times or more than once after the marker.
    """
    marker_line: Optional[str] = None
    for line in reversed(text.splitlines()):
        marker_index = line.find(marker)
        if marker_index != -1:
            marker_line = line[marker_index:]
            break

    if marker_line is None:
        return None

    # The key is escaped so that regex metacharacters in it (e.g. a dot)
    # are matched literally. Any amount of whitespace may follow the colon.
    pattern = rf'"{re.escape(key)}":\s*(\d+)'

    found_ints = re.findall(pattern, marker_line)
    if len(found_ints) != 1:
        # Zero or ambiguous matches - refuse to guess.
        return None

    return int(found_ints[0])
|
||||
|
||||
|
||||
def get_game_jam_json(jam_url: str, client: ItchApiClient) -> dict:
    """
    Download the entries JSON for the game jam found at `jam_url`.

    The jam page is fetched first so that the numeric jam ID can be read
    from its `I.ViewJam` config block. That ID is then used to request
    `<ITCH_URL>/jam/<id>/entries.json`.

    :param jam_url: URL of the game jam page.
    :param client: Client used for both HTTP requests.
    :return: The decoded JSON body of the entries response.
    :raises ItchDownloadError: If either request is not OK, or if no jam ID
        could be extracted from the jam page.
    """
    jam_page = client.get(jam_url)
    if not jam_page.ok:
        raise ItchDownloadError(f"Could not download the game jam site: {jam_page.status_code} {jam_page.reason}")

    jam_id: Optional[int] = get_int_after_marker_in_json(jam_page.text, "I.ViewJam", "id")
    if jam_id is None:
        missing_id_message = (
            "Provided site did not contain the Game Jam ID. Provide "
            "the path to the game jam entries JSON file instead, or "
            "create an itch-dl issue with the Game Jam URL."
        )
        raise ItchDownloadError(missing_id_message)

    logging.info(f"Extracted Game Jam ID: {jam_id}")

    entries_url = f"{ITCH_URL}/jam/{jam_id}/entries.json"
    entries_response = client.get(entries_url)
    if not entries_response.ok:
        raise ItchDownloadError(f"Could not download the game jam entries list: {entries_response.status_code} {entries_response.reason}")

    return entries_response.json()
|
||||
|
||||
|
||||
def get_jobs_for_browse_url(url: str, client: ItchApiClient) -> List[str]:
    """
    Every browser page has a hidden RSS feed that can be accessed by
    appending .xml to its URL. An optional "page" argument lets us
    iterate over their contents. When no more elements are available,
    the last returned <channel> has no <item> children.

    The input URL is cleaned in the main URL handler, so append the
    .xml?page=N suffix and iterate until we've caught 'em all.

    :param url: Cleaned browse page URL, without the .xml suffix.
    :param client: Client used to fetch the feed pages.
    :return: Unique game URLs, in the order they first appeared in the feed.
    :raises ItchDownloadError: If no URL was found on any page.
    """
    page = 1
    # A dict serves as an insertion-ordered set: duplicates are dropped,
    # but the result order stays stable between runs (a plain set would
    # yield the URLs in arbitrary order).
    found_urls = {}
    logging.info("Scraping game URLs from RSS feeds for %s", url)

    while True:
        logging.info(f"Downloading page {page} (found {len(found_urls)} URLs total)")
        r = client.get(f"{url}.xml?page={page}", append_api_key=False)
        if not r.ok:
            # A failed request is treated as the end of the feed.
            logging.info("RSS feed returned %s, finished.", r.reason)
            break

        soup = BeautifulSoup(r.text, features="xml")
        rss_items = soup.find_all("item")
        if len(rss_items) < 1:
            logging.info("No more items, finished.")
            break

        logging.info(f"Found {len(rss_items)} items.")
        for item in rss_items:
            link_node = item.find("link")
            if link_node is None:
                continue

            node_url = link_node.text.strip()
            if len(node_url) > 0:
                found_urls[node_url] = None

        page += 1

    if len(found_urls) == 0:
        raise ItchDownloadError("No game URLs found to download.")

    return list(found_urls)
|
||||
|
||||
|
||||
def get_jobs_for_itch_url(url: str, client: ItchApiClient) -> List[str]:
    """
    Turn an itch.io URL into a list of game page URLs to download.

    The URL is first normalized (http -> https, www.<base> -> <base>) and
    then dispatched on its host and first path segment:

    - `<base>/jam/<name>`: all entries of the game jam.
    - `<base>/<browser type>/...`: all games listed in the browse RSS feed.
    - `<base>/profile/<user>`: retried as `https://<user>.<base>`.
    - `<something>.<base>/<game>`: that single game, with a cleaned URL.

    :param url: URL to resolve.
    :param client: Client passed on to the jam/browse scrapers.
    :return: List of game page URLs.
    :raises NotImplementedError: For the site root, bundles, author pages
        and unrecognized site sections.
    :raises ValueError: For incomplete jam/profile URLs, jobs, forums and
        unknown domains.
    """
    if url.startswith("http://"):
        logging.info("HTTP link provided, upgrading to HTTPS")
        url = "https://" + url[len("http://"):]

    # Derive the slice offset from the prefix instead of hard-coding it, so
    # it stays correct if ITCH_BASE changes. Also accept the bare
    # "https://www.<base>" form, which has no trailing slash.
    www_prefix = f"https://www.{ITCH_BASE}"
    if url == www_prefix or url.startswith(www_prefix + "/"):
        logging.info(f"Correcting www.{ITCH_BASE} to {ITCH_BASE}")
        url = ITCH_URL + url[len(www_prefix):]

    url_parts = urllib.parse.urlparse(url)
    # Non-empty path segments only, so doubled or trailing slashes are ignored.
    url_path_parts: List[str] = [x for x in str(url_parts.path).split('/') if len(x) > 0]

    if url_parts.netloc == ITCH_BASE:
        if len(url_path_parts) == 0:
            raise NotImplementedError("itch-dl cannot download the entirety of itch.io.")
            # (yet) (also leafo would not be happy with the bandwidth bill)

        site = url_path_parts[0]

        if site == "jam":  # Game jams
            if len(url_path_parts) < 2:
                raise ValueError(f"Incomplete game jam URL: {url}")

            logging.info("Fetching Game Jam JSON...")
            clean_game_jam_url = f"{ITCH_URL}/jam/{url_path_parts[1]}"
            game_jam_json = get_game_jam_json(clean_game_jam_url, client)
            return get_jobs_for_game_jam_json(game_jam_json)

        elif site in ITCH_BROWSER_TYPES:  # Browser
            clean_browse_url = '/'.join([ITCH_URL, *url_path_parts])
            return get_jobs_for_browse_url(clean_browse_url, client)

        elif site in ("b", "bundle"):  # Bundles
            raise NotImplementedError("itch-dl cannot download bundles yet.")

        elif site in ("j", "jobs"):  # Jobs...
            raise ValueError("itch-dl cannot download a job.")

        elif site in ("t", "board", "community"):  # Forums
            raise ValueError("itch-dl cannot download forums.")

        elif site == "profile":  # Forum Profile
            if len(url_path_parts) >= 2:
                username = url_path_parts[1]
                logging.info("Correcting user profile to creator page for %s", username)
                return get_jobs_for_itch_url(f"https://{username}.{ITCH_BASE}", client)

            raise ValueError("itch-dl expects a username in profile links.")

        # Something else?
        raise NotImplementedError(f"itch-dl does not understand \"{site}\" URLs. Please file a new issue.")

    elif url_parts.netloc.endswith(f".{ITCH_BASE}"):
        if len(url_path_parts) == 0:  # Author
            # TODO: Find I.UserPage, regex for "user_id": [0-9]+, find the responsible API?
            raise NotImplementedError("itch-dl cannot download author pages yet.")

        else:  # Single game
            # Just clean and return the URL:
            return [f"https://{url_parts.netloc}/{url_path_parts[0]}"]

    else:
        raise ValueError(f"Unknown domain: {url_parts.netloc}")
|
||||
|
||||
|
||||
def get_jobs_for_path(path: str) -> List[str]:
    """
    Read a list of game URLs from a local file.

    Two formats are recognized, tried in this order:

    1. A Game Jam Entries JSON (a JSON dict with a 'jam_games' key).
    2. A plain text file with URLs: every line that starts with
       "https://" or "http://" after stripping whitespace is a job, and
       all other lines are ignored.

    :param path: Path of the file to read.
    :return: List of URLs found in the file.
    :raises ValueError: If the file is valid JSON but not a dict, or if no
        URLs could be read from it in either format.
    """
    # JSON is UTF-8 by specification, so do not depend on the locale
    # encoding. 'utf-8-sig' additionally drops a leading BOM, which would
    # otherwise break JSON parsing and hide the first URL of a plain list.
    with open(path, encoding="utf-8-sig") as f:
        content = f.read()

    try:  # Game Jam Entries JSON?
        json_data = json.loads(content)
    except json.JSONDecodeError:
        pass  # Not a valid JSON, okay...
    else:
        if not isinstance(json_data, dict):
            raise ValueError(f"File does not contain a JSON dict: {path}")

        if 'jam_games' in json_data:
            logging.info("Parsing provided file as a Game Jam Entries JSON...")
            return get_jobs_for_game_jam_json(json_data)

    # Plain job list?
    stripped_lines = (line.strip() for line in content.splitlines())
    url_list = [line for line in stripped_lines if line.startswith(("https://", "http://"))]

    if url_list:
        logging.info("Parsing provided file as a list of URLs to fetch...")
        return url_list

    raise ValueError("File format is unknown - cannot read URLs to download.")
|
||||
|
||||
|
||||
def get_jobs_for_url_or_path(path_or_url: str, api_key: str) -> List[str]:
    """
    Returns a list of Game URLs for a given itch.io URL or file.

    :param path_or_url: An http(s) URL, or the path of an existing file.
        Surrounding whitespace is ignored.
    :param api_key: Key used to create the API client for URL inputs.
    :return: List of game URLs.
    :raises ValueError: If the input is neither an http(s) URL nor an
        existing file (previously this case silently returned None).
    """
    path_or_url = path_or_url.strip()

    if path_or_url.startswith("http://"):
        logging.info("HTTP link provided, upgrading to HTTPS")
        path_or_url = "https://" + path_or_url[len("http://"):]

    if path_or_url.startswith("https://"):
        client = ItchApiClient(api_key)
        return get_jobs_for_itch_url(path_or_url, client)

    if os.path.isfile(path_or_url):
        return get_jobs_for_path(path_or_url)

    # Fail loudly instead of falling off the end and returning None, which
    # would contradict the declared List[str] return type.
    raise ValueError(f"Cannot handle path or URL: {path_or_url}")
|
||||
Reference in New Issue
Block a user