mirror of
https://github.com/DragoonAethis/itch-dl.git
synced 2024-12-20 18:11:52 +01:00
Implement infobox parsing, misc bugfixes, version bump
This commit is contained in:
parent
f5c0f4658d
commit
008e6870e8
14
README.md
14
README.md
@ -5,12 +5,13 @@ Bulk download games from [itch.io](https://itch.io/).
|
||||
- Can download game jams, browse pages (popular, newest, browse by tag...) and individual games.
|
||||
- Requires Python 3.8+, grab it from PyPI: `pip install itch-dl`
|
||||
- For development, use [Poetry](https://python-poetry.org/).
|
||||
- Optionally requires wget for site mirroring.
|
||||
|
||||
How to use this:
|
||||
|
||||
## How to use
|
||||
|
||||
- Log into itch.io with the account you'd like to use for downloading.
|
||||
- Generate a new API key on your user account page: https://itch.io/user/settings/api-keys
|
||||
- Generate [a new API key](https://itch.io/user/settings/api-keys) on your user account page.
|
||||
- Check out which flags you can toggle: `itch-dl --help`
|
||||
- Run the downloader: `itch-dl --api-key <KEY> https://itch.io/jam/yourjamhere`
|
||||
- Wait. This is going to take a while.
|
||||
|
||||
@ -21,12 +22,13 @@ game jam. The input can also be a path to an itch.io JSON file with game jam entr
|
||||
a list of itch.io game URLs (not browse/jam pages!) to download.
|
||||
|
||||
**It's expected that the downloader output will not be complete** - logs are stupidly verbose
|
||||
and it prints a report on successful/failed downloads, so you must manually grab whatever was
|
||||
not handled for you automatically for some reason.
|
||||
and it prints a report on failed downloads and external URLs (links to files that are not on
|
||||
itch.io itself, but rather on an external host like Google Drive, Dropbox, etc.), so you must
|
||||
manually grab whatever was not handled for you automatically.
|
||||
|
||||
The downloader also grabs the entry page HTML, which usually comes with controls and such. By
|
||||
default, it does not download images, assets and so on, just the text - use `--mirror-web` to
|
||||
try and download these as well. This requires `wget` to be available in your `PATH`.
|
||||
try and download these as well. This does not work very well yet, but gets the basics done.
|
||||
|
||||
|
||||
## Game Jam Entries JSON
|
||||
|
@ -1 +1 @@
|
||||
__version__ = '0.1.0'
|
||||
__version__ = '0.2.0'
|
||||
|
@ -2,7 +2,7 @@ ITCH_BASE = "itch.io"
|
||||
ITCH_URL = f"https://{ITCH_BASE}"
|
||||
ITCH_API = f"https://api.{ITCH_BASE}"
|
||||
|
||||
# Extracts https://user.itch.io/gamename to {'author': 'user', 'game': 'gamename'}
|
||||
# Extracts https://user.itch.io/name to {'author': 'user', 'game': 'name'}
|
||||
# The dots in the host name are escaped so that they only match a literal ".";
# unescaped, "https://userXitchYio/name" would be accepted as a game URL.
ITCH_GAME_URL_REGEX = r"^https:\/\/(?P<author>[\w\d\-_]+)\.itch\.io\/(?P<game>[\w\d\-_]+)$"
|
||||
|
||||
ITCH_BROWSER_TYPES = [
|
||||
|
@ -2,7 +2,8 @@ import os
|
||||
import json
|
||||
import re
|
||||
import logging
|
||||
from typing import Tuple, List, Dict, TypedDict, Optional
|
||||
import urllib.parse
|
||||
from typing import List, Dict, TypedDict, Optional, Union
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from requests.exceptions import HTTPError
|
||||
@ -13,7 +14,7 @@ from tqdm.contrib.concurrent import thread_map
|
||||
from .api import ItchApiClient
|
||||
from .utils import ItchDownloadError, get_int_after_marker_in_json
|
||||
from .consts import ITCH_GAME_URL_REGEX
|
||||
from .infobox import parse_infobox
|
||||
from .infobox import parse_infobox, InfoboxMetadata
|
||||
|
||||
TARGET_PATHS = {
|
||||
'site': 'site.html',
|
||||
@ -25,11 +26,11 @@ TARGET_PATHS = {
|
||||
|
||||
|
||||
class DownloadResult:
    """Outcome of a single game download attempt.

    Attributes:
        url: The game URL the download was requested for.
        success: Whether the download finished without errors.
        errors: Human-readable error messages; always a list, never None.
        external_urls: Download links that point outside of itch.io;
            always a list, never None.
    """

    def __init__(
        self,
        url: str,
        success: bool,
        errors: Optional[List[str]],
        external_urls: Optional[List[str]] = None,
    ):
        self.url = url
        self.success = success
        # Normalize None (and other falsy values) to an empty list so that
        # consumers can iterate or len() these without a None check.
        self.errors = errors or []
        self.external_urls = external_urls or []

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(url={self.url!r}, success={self.success!r}, "
            f"errors={self.errors!r}, external_urls={self.external_urls!r})"
        )
|
||||
|
||||
|
||||
class GameMetadata(TypedDict, total=False):
|
||||
@ -47,7 +48,11 @@ class GameMetadata(TypedDict, total=False):
|
||||
screenshots: List[str]
|
||||
description: str
|
||||
|
||||
rating: Dict[str, Union[float, int]]
|
||||
extra: InfoboxMetadata
|
||||
|
||||
created_at: str
|
||||
updated_at: str
|
||||
released_at: str
|
||||
published_at: str
|
||||
|
||||
@ -60,7 +65,8 @@ class GameDownloader:
|
||||
self.download_keys = keys
|
||||
self.client = ItchApiClient(api_key)
|
||||
|
||||
def get_rating_json(self, site) -> Optional[dict]:
|
||||
@staticmethod
|
||||
def get_rating_json(site) -> Optional[dict]:
|
||||
for ldjson_node in site.find_all("script", type="application/ld+json"):
|
||||
try:
|
||||
ldjson: dict = json.loads(ldjson_node.text.strip())
|
||||
@ -71,7 +77,8 @@ class GameDownloader:
|
||||
|
||||
return None
|
||||
|
||||
def get_meta(self, site, **kwargs) -> Optional[str]:
|
||||
@staticmethod
|
||||
def get_meta(site, **kwargs) -> Optional[str]:
|
||||
"""Grabs <meta property="xyz" content="value"/> values."""
|
||||
node = site.find("meta", attrs=kwargs)
|
||||
if not node:
|
||||
@ -140,8 +147,34 @@ class GameDownloader:
|
||||
infobox_div = site.find("div", class_="game_info_panel_widget")
|
||||
if infobox_div:
|
||||
infobox = parse_infobox(infobox_div)
|
||||
for dt in ('created_at', 'updated_at', 'released_at', 'published_at'):
|
||||
if dt in infobox:
|
||||
# noinspection PyTypedDict
|
||||
metadata[dt] = infobox[dt].isoformat()
|
||||
del infobox[dt]
|
||||
|
||||
TODO_KEYS = ['author', 'author_url', 'created_at', 'released_at', 'published_at']
|
||||
if 'author' in infobox:
|
||||
metadata['author'] = infobox['author']['author']
|
||||
metadata['author_url'] = infobox['author']['author_url']
|
||||
del infobox['author']
|
||||
|
||||
if 'authors' in infobox and 'author' not in metadata:
|
||||
# Some games may have multiple authors (ex. compilations).
|
||||
metadata['author'] = "Multiple authors"
|
||||
metadata['author_url'] = f"https://{urllib.parse.urlparse(url).netloc}"
|
||||
|
||||
metadata['extra'] = infobox
|
||||
|
||||
agg_rating = rating_json.get('aggregateRating')
|
||||
if agg_rating:
|
||||
try:
|
||||
metadata['rating'] = {
|
||||
'average': float(agg_rating['ratingValue']),
|
||||
'votes': agg_rating['ratingCount']
|
||||
}
|
||||
except: # noqa
|
||||
logging.exception("Could not extract the rating metadata...")
|
||||
pass # Nope, just, don't
|
||||
|
||||
return metadata
|
||||
|
||||
@ -179,7 +212,7 @@ class GameDownloader:
|
||||
def download(self, url: str, skip_downloaded: bool = True):
|
||||
match = re.match(ITCH_GAME_URL_REGEX, url)
|
||||
if not match:
|
||||
return DownloadResult(url, False, [f"Game URL is invalid: {url} - please file a new issue."])
|
||||
return DownloadResult(url, False, [f"Game URL is invalid: {url} - please file a new issue."], [])
|
||||
|
||||
author, game = match['author'], match['game']
|
||||
|
||||
@ -192,14 +225,14 @@ class GameDownloader:
|
||||
# As metadata is the final file we write, all the files
|
||||
# should already be downloaded at this point.
|
||||
logging.info("Skipping already-downloaded game for URL: %s", url)
|
||||
return DownloadResult(url, True, [f"Game already downloaded."])
|
||||
return DownloadResult(url, True, [f"Game already downloaded."], [])
|
||||
|
||||
try:
|
||||
logging.info("Downloading %s", url)
|
||||
r = self.client.get(url, append_api_key=False)
|
||||
r.raise_for_status()
|
||||
except Exception as e:
|
||||
return DownloadResult(url, False, [f"Could not download the game site for {url}: {e}"])
|
||||
return DownloadResult(url, False, [f"Could not download the game site for {url}: {e}"], [])
|
||||
|
||||
site = BeautifulSoup(r.text, features="lxml")
|
||||
try:
|
||||
@ -207,14 +240,14 @@ class GameDownloader:
|
||||
metadata = self.extract_metadata(game_id, url, site)
|
||||
title = metadata['title'] or game
|
||||
except ItchDownloadError as e:
|
||||
return DownloadResult(url, False, [str(e)])
|
||||
return DownloadResult(url, False, [str(e)], [])
|
||||
|
||||
credentials = self.get_credentials(title, game_id)
|
||||
try:
|
||||
game_uploads_req = self.client.get(f"/games/{game_id}/uploads", data=credentials, timeout=15)
|
||||
game_uploads_req.raise_for_status()
|
||||
except Exception as e:
|
||||
return DownloadResult(url, False, [f"Could not fetch game uploads for {title}: {e}"])
|
||||
return DownloadResult(url, False, [f"Could not fetch game uploads for {title}: {e}"], [])
|
||||
|
||||
game_uploads = game_uploads_req.json()['uploads']
|
||||
logging.debug("Found %d upload(s): %s", len(game_uploads), str(game_uploads))
|
||||
@ -264,17 +297,20 @@ class GameDownloader:
|
||||
if len(external_urls) > 0:
|
||||
logging.warning(f"Game {title} has external download URLs: {external_urls}")
|
||||
|
||||
# TODO: Screenshots and site assets
|
||||
# TODO: Mirror JS/CSS assets
|
||||
if self.mirror_web:
|
||||
os.makedirs(paths['screenshots'], exist_ok=True)
|
||||
for screenshot in metadata['screenshots']:
|
||||
if not screenshot:
|
||||
continue
|
||||
|
||||
file_name = os.path.basename(screenshot)
|
||||
try:
|
||||
self.download_file(screenshot, os.path.join(paths['screenshots'], file_name), credentials={})
|
||||
except Exception as e:
|
||||
errors.append(f"Screenshot download failed (this is not fatal): {e}")
|
||||
|
||||
if 'cover_url' in metadata:
|
||||
if metadata.get('cover_url'):
|
||||
try:
|
||||
cover_url = metadata['cover_url']
|
||||
self.download_file(cover_url, paths['cover'] + os.path.splitext(cover_url)[-1], credentials={})
|
||||
@ -285,7 +321,7 @@ class GameDownloader:
|
||||
f.write(site.prettify())
|
||||
|
||||
with open(paths['metadata'], 'w') as f:
|
||||
json.dump(metadata, f)
|
||||
json.dump(metadata, f, indent=4)
|
||||
|
||||
if len(errors) > 0:
|
||||
logging.error(f"Game {title} has download errors: {errors}")
|
||||
@ -294,7 +330,14 @@ class GameDownloader:
|
||||
return DownloadResult(url, len(errors) == 0, errors, external_urls)
|
||||
|
||||
|
||||
def drive_downloads(jobs: List[str], download_to: str, mirror_web: bool, api_key: str, keys: Dict[int, str], parallel: int = 1):
|
||||
def drive_downloads(
|
||||
jobs: List[str],
|
||||
download_to: str,
|
||||
mirror_web: bool,
|
||||
api_key: str,
|
||||
keys: Dict[int, str],
|
||||
parallel: int = 1
|
||||
):
|
||||
downloader = GameDownloader(download_to, mirror_web, api_key, keys)
|
||||
tqdm_args = {
|
||||
"desc": "Games",
|
||||
|
@ -5,17 +5,43 @@ from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
class InfoboxMetadata(TypedDict, total=False):
    """Values parsed out of a game page infobox, keyed by block type.

    Every key is optional (total=False): only the blocks found in the
    infobox are set. Keep this in sync with the keys returned by parse_tr.
    """

    updated_at: datetime
    released_at: datetime
    published_at: datetime
    status: str
    platforms: List[str]  # Windows/macOS/Linux/etc
    publisher: str
    author: Dict[str, str]  # See impl below!
    authors: Dict[str, str]  # Links
    genre: Dict[str, str]  # Links
    tools: Dict[str, str]  # Links
    license: Dict[str, str]  # Links
    asset_license: Dict[str, str]  # Links
    tags: Dict[str, str]  # Links
    length: str
    languages: Dict[str, str]  # Links
    multiplayer: Dict[str, str]  # Links
    player_count: str
    accessibility: Dict[str, str]  # Links
    inputs: Dict[str, str]  # Links
    links: Dict[str, str]  # Links
    mentions: Dict[str, str]  # Links
|
||||
|
||||
|
||||
def parse_date_block(td: BeautifulSoup) -> Optional[datetime]:
    """Parses the timestamp stored in the title of the cell's <abbr> tag.

    The title is expected to look like "<day> <month name> <year> @ <HH:MM>",
    ex. "01 January 2022 @ 12:34". The result is a naive datetime with
    minute precision.

    Returns None if the cell has no <abbr> tag with a title attribute.
    Raises ValueError if the title is present, but not in the expected format.
    """
    abbr = td.find("abbr")
    if abbr is None or 'title' not in abbr.attrs:
        return None

    title = abbr['title']
    date_str, separator, time_str = title.partition('@')
    if not separator or '@' in time_str:
        # Without this check, a missing or repeated "@" would only surface
        # as an opaque tuple unpacking error.
        raise ValueError(f"Unexpected infobox date format: '{title}'")

    # NOTE(review): %B matches month names of the current locale - this
    # presumably relies on an English/C locale, confirm.
    return datetime.strptime(f"{date_str.strip()} {time_str.strip()}", "%d %B %Y %H:%M")
|
||||
|
||||
|
||||
def parse_links(td: BeautifulSoup) -> Dict[str, str]:
    """Parses blocks of comma-separated <a> blocks, returns a dict
    of link text -> URL it points at.

    Anchors without a href attribute are skipped. If several links
    share the same text, the last one wins."""
    # href=True only matches <a> tags that have a href attribute, so
    # the link['href'] lookup below cannot raise a KeyError.
    return {link.text.strip(): link['href'] for link in td.find_all("a", href=True)}
|
||||
|
||||
|
||||
def parse_text_from_links(td: BeautifulSoup) -> List[str]:
|
||||
@ -24,7 +50,54 @@ def parse_text_from_links(td: BeautifulSoup) -> List[str]:
|
||||
|
||||
# Infobox row name -> metadata key, for rows holding a timestamp.
_DATE_BLOCKS = {
    "Updated": "updated_at",
    "Release date": "released_at",
    "Published": "published_at",
}

# Infobox row name -> metadata key, for rows holding a list of links.
_LINK_BLOCKS = {
    "Authors": "authors",
    "Genre": "genre",
    "Made with": "tools",
    "License": "license",
    "Asset license": "asset_license",
    "Tags": "tags",
    "Languages": "languages",
    "Multiplayer": "multiplayer",
    "Accessibility": "accessibility",
    "Inputs": "inputs",
    "Links": "links",
    "Mentions": "mentions",
}

# Infobox row name -> metadata key, for rows where only the text of the
# first link is kept.
_FIRST_LINK_TEXT_BLOCKS = {
    "Status": "status",
    "Average session": "length",
}

# Infobox row name -> metadata key, for rows holding plain text.
_PLAIN_TEXT_BLOCKS = {
    "Publisher": "publisher",
    "Player count": "player_count",
}


def parse_tr(name: str, content: BeautifulSoup) -> Optional[Tuple[str, Any]]:
    """Parses a single infobox row into a (metadata key, value) pair.

    name is the row label (ex. "Updated"), content is the row's value cell.

    Returns None if the row should be skipped: the "Rating" row (read the
    AggregatedRating block instead!), or a row whose cell does not contain
    the date/link(s) expected for its type.

    Raises NotImplementedError for unknown row labels.
    """
    if name in _DATE_BLOCKS:
        parsed_date = parse_date_block(content)
        # Skip rows with no date instead of storing None under the key.
        return (_DATE_BLOCKS[name], parsed_date) if parsed_date is not None else None

    if name in _FIRST_LINK_TEXT_BLOCKS:
        texts = parse_text_from_links(content)
        # An empty cell would otherwise fail with an IndexError.
        return (_FIRST_LINK_TEXT_BLOCKS[name], texts[0]) if texts else None

    if name in _PLAIN_TEXT_BLOCKS:
        return _PLAIN_TEXT_BLOCKS[name], content.text.strip()

    if name in _LINK_BLOCKS:
        return _LINK_BLOCKS[name], parse_links(content)

    if name == "Platforms":
        return "platforms", parse_text_from_links(content)

    if name == "Rating":
        return None  # Read the AggregatedRating block instead!

    if name == "Author":
        links = parse_links(content)
        if not links:
            # popitem() on an empty dict would raise a KeyError.
            return None

        author, author_url = links.popitem()
        return "author", {"author": author, "author_url": author_url}

    # Oops, you need to extend this with something new. Sorry.
    # Make sure to add the block name to InfoboxMetadata as well!
    raise NotImplementedError(f"Unknown infobox block name '{name}' - please file a new itch-dl issue.")
|
||||
|
||||
|
||||
def parse_infobox(infobox: BeautifulSoup) -> dict:
|
||||
@ -42,6 +115,7 @@ def parse_infobox(infobox: BeautifulSoup) -> dict:
|
||||
|
||||
parsed_block = parse_tr(name, content_td)
|
||||
if parsed_block:
|
||||
# noinspection PyTypedDict
|
||||
meta[parsed_block[0]] = parsed_block[1]
|
||||
|
||||
return meta
|
||||
|
Loading…
Reference in New Issue
Block a user