Mirror of https://github.com/DragoonAethis/itch-dl.git (synced 2024-12-21 02:21:52 +01:00)
Implement infobox parsing, misc bugfixes, version bump

commit 008e6870e8 (parent f5c0f4658d)
Changed: README.md (14 lines), plus the Python sources below.
```diff
@@ -5,12 +5,13 @@ Bulk download games from [itch.io](https://itch.io/).
 - Can download game jams, browse pages (popular, newest, browse by tag...) and individual games.
 - Requires Python 3.8+, grab it from PyPI: `pip install itch-dl`
 - For development, use [Poetry](https://python-poetry.org/).
-- Optionally requires wget for site mirroring.
 
-How to use this:
+## How to use
+
 
 - Log into itch.io with the account you'd like to use for downloading.
-- Generate a new API key on your user account page: https://itch.io/user/settings/api-keys
+- Generate [a new API key](https://itch.io/user/settings/api-keys) on your user account page.
+- Check out which flags you can toggle: `itch-dl --help`
 - Run the downloader: `itch-dl --api-key <KEY> https://itch.io/jam/yourjamhere`
 - Wait. This is going to take a while.
 
```
```diff
@@ -21,12 +22,13 @@ game jam. The input can also be a path to an itch.io JSON file with game jam entries, or
 a list of itch.io game URLs (not browse/jam pages!) to download.
 
 **It's expected that the downloader output will not be complete** - logs are stupidly verbose
-and it prints a report on successful/failed downloads, so you must manually grab whatever was
-not handled for you automatically for some reason.
+and it prints a report on failed downloads and external URLs (links to files that are not on
+itch.io itself, but rather on an external host like Google Drive, Dropbox, etc), so you must
+manually grab whatever was not handled for you automatically.
 
 The downloader also grabs the entry page HTML, which usually comes with controls and such. By
 default, it does not download images, assets and so on, just the text - use `--mirror-web` to
-try and download these as well. This requires `wget` to be available in your `PATH`.
+try and download these as well. This does not work very well yet, but gets the basics done.
 
 
 ## Game Jam Entries JSON
```
```diff
@@ -1 +1 @@
-__version__ = '0.1.0'
+__version__ = '0.2.0'
```
```diff
@@ -2,7 +2,7 @@ ITCH_BASE = "itch.io"
 ITCH_URL = f"https://{ITCH_BASE}"
 ITCH_API = f"https://api.{ITCH_BASE}"
 
-# Extracts https://user.itch.io/gamename to {'author': 'user', 'game': 'gamename'}
+# Extracts https://user.itch.io/name to {'author': 'user', 'game': 'name'}
 ITCH_GAME_URL_REGEX = r"^https:\/\/(?P<author>[\w\d\-_]+).itch.io\/(?P<game>[\w\d\-_]+)$"
 
 ITCH_BROWSER_TYPES = [
```
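As a quick sanity check of the rewritten comment, the regex can be exercised directly (a minimal sketch; the pattern is copied verbatim from the hunk above):

```python
import re

ITCH_GAME_URL_REGEX = r"^https:\/\/(?P<author>[\w\d\-_]+).itch.io\/(?P<game>[\w\d\-_]+)$"

match = re.match(ITCH_GAME_URL_REGEX, "https://user.itch.io/name")
print(match.groupdict())  # {'author': 'user', 'game': 'name'}
```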
```diff
@@ -2,7 +2,8 @@ import os
 import json
 import re
 import logging
-from typing import Tuple, List, Dict, TypedDict, Optional
+import urllib.parse
+from typing import List, Dict, TypedDict, Optional, Union
 
 from bs4 import BeautifulSoup
 from requests.exceptions import HTTPError
```
```diff
@@ -13,7 +14,7 @@ from tqdm.contrib.concurrent import thread_map
 from .api import ItchApiClient
 from .utils import ItchDownloadError, get_int_after_marker_in_json
 from .consts import ITCH_GAME_URL_REGEX
-from .infobox import parse_infobox
+from .infobox import parse_infobox, InfoboxMetadata
 
 TARGET_PATHS = {
     'site': 'site.html',
```
```diff
@@ -25,11 +26,11 @@ TARGET_PATHS = {
 
 
 class DownloadResult:
-    def __init__(self, url: str, success: bool, errors, external_urls: Optional[List[str]] = None):
+    def __init__(self, url: str, success: bool, errors, external_urls: List[str]):
         self.url = url
         self.success = success
-        self.errors = errors
-        self.external_urls = external_urls
+        self.errors = errors or []
+        self.external_urls = external_urls or []
 
 
 class GameMetadata(TypedDict, total=False):
```
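The `or []` guards normalize a `None` argument into an empty list, so reporting code can iterate without `None` checks. A minimal standalone sketch of the new behavior (the example URL is made up):

```python
from typing import List

class DownloadResult:
    def __init__(self, url: str, success: bool, errors, external_urls: List[str]):
        self.url = url
        self.success = success
        self.errors = errors or []  # None becomes an empty list
        self.external_urls = external_urls or []

r = DownloadResult("https://user.itch.io/name", True, None, [])
print(r.errors, r.external_urls)  # [] [] - safe to iterate, never None
```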
```diff
@@ -47,7 +48,11 @@ class GameMetadata(TypedDict, total=False):
     screenshots: List[str]
     description: str
 
+    rating: Dict[str, Union[float, int]]
+    extra: InfoboxMetadata
+
     created_at: str
+    updated_at: str
     released_at: str
     published_at: str
 
```
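Because `GameMetadata` is declared with `total=False`, every key stays optional and the downloader can fill fields as it finds them. A trimmed sketch showing just the fields added here (values made up):

```python
from typing import Dict, TypedDict, Union

class GameMetadata(TypedDict, total=False):  # trimmed to the new fields
    rating: Dict[str, Union[float, int]]
    updated_at: str

meta: GameMetadata = {}  # total=False makes every key optional
meta['rating'] = {'average': 4.5, 'votes': 12}
meta['updated_at'] = '2022-03-01T14:35:00'
print(meta)
```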
```diff
@@ -60,7 +65,8 @@ class GameDownloader:
         self.download_keys = keys
         self.client = ItchApiClient(api_key)
 
-    def get_rating_json(self, site) -> Optional[dict]:
+    @staticmethod
+    def get_rating_json(site) -> Optional[dict]:
         for ldjson_node in site.find_all("script", type="application/ld+json"):
             try:
                 ldjson: dict = json.loads(ldjson_node.text.strip())
```
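`get_rating_json` scans the page's `<script type="application/ld+json">` blocks. A minimal sketch of the input it operates on; the exact payload shape is an assumption, but the `aggregateRating` keys match those used later in this commit:

```python
import json
from bs4 import BeautifulSoup

# Hypothetical ld+json block; the "@type" value is an assumption.
html = '''<script type="application/ld+json">
{"@type": "Product", "aggregateRating": {"ratingValue": "4.5", "ratingCount": 12}}
</script>'''

site = BeautifulSoup(html, features="lxml")
for node in site.find_all("script", type="application/ld+json"):
    print(json.loads(node.text.strip()))
```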
```diff
@@ -71,7 +77,8 @@ class GameDownloader:
 
         return None
 
-    def get_meta(self, site, **kwargs) -> Optional[str]:
+    @staticmethod
+    def get_meta(site, **kwargs) -> Optional[str]:
         """Grabs <meta property="xyz" content="value"/> values."""
         node = site.find("meta", attrs=kwargs)
         if not node:
```
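`get_meta` is a thin wrapper around attribute-based `<meta>` lookup. A minimal sketch (the `og:image` tag and URL are hypothetical):

```python
from bs4 import BeautifulSoup

html = '<meta property="og:image" content="https://img.itch.zone/cover.png"/>'
site = BeautifulSoup(html, features="lxml")

node = site.find("meta", attrs={"property": "og:image"})
print(node.get("content"))  # https://img.itch.zone/cover.png
```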
```diff
@@ -140,8 +147,34 @@ class GameDownloader:
         infobox_div = site.find("div", class_="game_info_panel_widget")
         if infobox_div:
             infobox = parse_infobox(infobox_div)
+            for dt in ('created_at', 'updated_at', 'released_at', 'published_at'):
+                if dt in infobox:
+                    # noinspection PyTypedDict
+                    metadata[dt] = infobox[dt].isoformat()
+                    del infobox[dt]
 
-            TODO_KEYS = ['author', 'author_url', 'created_at', 'released_at', 'published_at']
+            if 'author' in infobox:
+                metadata['author'] = infobox['author']['author']
+                metadata['author_url'] = infobox['author']['author_url']
+                del infobox['author']
+
+            if 'authors' in infobox and 'author' not in metadata:
+                # Some games may have multiple authors (ex. compilations).
+                metadata['author'] = "Multiple authors"
+                metadata['author_url'] = f"https://{urllib.parse.urlparse(url).netloc}"
+
+            metadata['extra'] = infobox
+
+        agg_rating = rating_json.get('aggregateRating')
+        if agg_rating:
+            try:
+                metadata['rating'] = {
+                    'average': float(agg_rating['ratingValue']),
+                    'votes': agg_rating['ratingCount']
+                }
+            except: # noqa
+                logging.exception("Could not extract the rating metadata...")
+                pass  # Nope, just, don't
+
         return metadata
 
```
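The new block lifts well-known infobox keys into top-level metadata and dumps the remainder into `extra`. A standalone sketch with a hypothetical `parse_infobox` result (values made up, shapes taken from the hunk above):

```python
from datetime import datetime

# Hypothetical parse_infobox() output; shapes follow the code above.
infobox = {
    'published_at': datetime(2022, 3, 1, 14, 35),
    'author': {'author': 'user', 'author_url': 'https://user.itch.io'},
    'tags': {'Horror': 'https://itch.io/games/tag-horror'},
}

metadata = {}
for dt in ('created_at', 'updated_at', 'released_at', 'published_at'):
    if dt in infobox:
        metadata[dt] = infobox[dt].isoformat()  # '2022-03-01T14:35:00'
        del infobox[dt]

if 'author' in infobox:
    metadata['author'] = infobox['author']['author']
    metadata['author_url'] = infobox['author']['author_url']
    del infobox['author']

metadata['extra'] = infobox  # everything left over lands in the 'extra' bag
print(metadata)
```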
```diff
@@ -179,7 +212,7 @@ class GameDownloader:
     def download(self, url: str, skip_downloaded: bool = True):
         match = re.match(ITCH_GAME_URL_REGEX, url)
         if not match:
-            return DownloadResult(url, False, [f"Game URL is invalid: {url} - please file a new issue."])
+            return DownloadResult(url, False, [f"Game URL is invalid: {url} - please file a new issue."], [])
 
         author, game = match['author'], match['game']
 
```
```diff
@@ -192,14 +225,14 @@ class GameDownloader:
             # As metadata is the final file we write, all the files
             # should already be downloaded at this point.
             logging.info("Skipping already-downloaded game for URL: %s", url)
-            return DownloadResult(url, True, [f"Game already downloaded."])
+            return DownloadResult(url, True, [f"Game already downloaded."], [])
 
         try:
             logging.info("Downloading %s", url)
             r = self.client.get(url, append_api_key=False)
             r.raise_for_status()
         except Exception as e:
-            return DownloadResult(url, False, [f"Could not download the game site for {url}: {e}"])
+            return DownloadResult(url, False, [f"Could not download the game site for {url}: {e}"], [])
 
         site = BeautifulSoup(r.text, features="lxml")
         try:
```
```diff
@@ -207,14 +240,14 @@ class GameDownloader:
             metadata = self.extract_metadata(game_id, url, site)
             title = metadata['title'] or game
         except ItchDownloadError as e:
-            return DownloadResult(url, False, [str(e)])
+            return DownloadResult(url, False, [str(e)], [])
 
         credentials = self.get_credentials(title, game_id)
         try:
             game_uploads_req = self.client.get(f"/games/{game_id}/uploads", data=credentials, timeout=15)
             game_uploads_req.raise_for_status()
         except Exception as e:
-            return DownloadResult(url, False, [f"Could not fetch game uploads for {title}: {e}"])
+            return DownloadResult(url, False, [f"Could not fetch game uploads for {title}: {e}"], [])
 
         game_uploads = game_uploads_req.json()['uploads']
         logging.debug("Found %d upload(s): %s", len(game_uploads), str(game_uploads))
```
```diff
@@ -264,17 +297,20 @@ class GameDownloader:
         if len(external_urls) > 0:
             logging.warning(f"Game {title} has external download URLs: {external_urls}")
 
-        # TODO: Screenshots and site assets
+        # TODO: Mirror JS/CSS assets
         if self.mirror_web:
             os.makedirs(paths['screenshots'], exist_ok=True)
             for screenshot in metadata['screenshots']:
+                if not screenshot:
+                    continue
+
                 file_name = os.path.basename(screenshot)
                 try:
                     self.download_file(screenshot, os.path.join(paths['screenshots'], file_name), credentials={})
                 except Exception as e:
                     errors.append(f"Screenshot download failed (this is not fatal): {e}")
 
-        if 'cover_url' in metadata:
+        if metadata.get('cover_url'):
             try:
                 cover_url = metadata['cover_url']
                 self.download_file(cover_url, paths['cover'] + os.path.splitext(cover_url)[-1], credentials={})
```
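Two small guards land here: empty screenshot URLs are skipped, and the cover is fetched only when `cover_url` is present and non-empty (`'cover_url' in metadata` was true even for an empty string). A minimal sketch (URLs made up):

```python
import os

screenshots = ["https://img.itch.zone/shots/shot1.png", ""]  # hypothetical list
for screenshot in screenshots:
    if not screenshot:
        continue  # an empty URL would produce an empty basename
    print(os.path.basename(screenshot))  # shot1.png

metadata = {"cover_url": ""}
print(bool(metadata.get("cover_url")))  # False - the old 'in' check was True here
```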
```diff
@@ -285,7 +321,7 @@ class GameDownloader:
             f.write(site.prettify())
 
         with open(paths['metadata'], 'w') as f:
-            json.dump(metadata, f)
+            json.dump(metadata, f, indent=4)
 
         if len(errors) > 0:
             logging.error(f"Game {title} has download errors: {errors}")
```
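`indent=4` simply pretty-prints the metadata file. For illustration, via `json.dumps` with made-up data:

```python
import json

print(json.dumps({"title": "Some Game", "rating": {"average": 4.5, "votes": 12}}, indent=4))
# {
#     "title": "Some Game",
#     "rating": {
#         "average": 4.5,
#         "votes": 12
#     }
# }
```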
```diff
@@ -294,7 +330,14 @@ class GameDownloader:
         return DownloadResult(url, len(errors) == 0, errors, external_urls)
 
 
-def drive_downloads(jobs: List[str], download_to: str, mirror_web: bool, api_key: str, keys: Dict[int, str], parallel: int = 1):
+def drive_downloads(
+        jobs: List[str],
+        download_to: str,
+        mirror_web: bool,
+        api_key: str,
+        keys: Dict[int, str],
+        parallel: int = 1
+):
     downloader = GameDownloader(download_to, mirror_web, api_key, keys)
     tqdm_args = {
         "desc": "Games",
```
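A hypothetical invocation of the reflowed signature; the import path and all values are assumptions, not taken from this diff (the `keys` shape follows the `Dict[int, str]` annotation, presumably game ID to download key):

```python
# Hypothetical usage sketch; the module path is assumed.
from itch_dl.downloader import drive_downloads

drive_downloads(
    jobs=["https://user.itch.io/name"],  # game URLs, not browse/jam pages
    download_to="./downloads",
    mirror_web=False,
    api_key="YOUR_API_KEY",  # from https://itch.io/user/settings/api-keys
    keys={},  # presumably game ID -> download key; empty when none are needed
    parallel=4,
)
```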
```diff
@@ -5,17 +5,43 @@ from bs4 import BeautifulSoup
 
 
 class InfoboxMetadata(TypedDict, total=False):
-    pass
+    updated_at: datetime
+    released_at: datetime
+    published_at: datetime
+    status: str
+    platforms: List[str]  # Windows/macOS/Linux/etc
+    publisher: str
+    author: Dict[str, str]  # See impl below!
+    authors: Dict[str, str]  # Links
+    genre: Dict[str, str]  # Links
+    tools: Dict[str, str]  # Links
+    license: Dict[str, str]  # Links
+    asset_license: Dict[str, str]  # Links
+    tags: Dict[str, str]  # Links
+    length: str
+    multiplayer: Dict[str, str]  # Links
+    player_count: str
+    accessibility: Dict[str, str]  # Links
+    inputs: Dict[str, str]  # Links
+    links: Dict[str, str]  # Links
+    mentions: Dict[str, str]  # Links
 
 
-def parse_date_block(td: BeautifulSoup) -> datetime:
-    raise NotImplementedError("Not yet!")
+def parse_date_block(td: BeautifulSoup) -> Optional[datetime]:
+    abbr = td.find("abbr")
+    if not abbr or 'title' not in abbr.attrs:
+        return None
+
+    date_str, time_str = abbr['title'].split('@')
+    date = datetime.strptime(date_str.strip(), "%d %B %Y")
+    time = datetime.strptime(time_str.strip(), "%H:%M")
+    return datetime(date.year, date.month, date.day, time.hour, time.minute)
 
 
 def parse_links(td: BeautifulSoup) -> Dict[str, str]:
     """Parses blocks of comma-separated <a> blocks, returns a dict
     of link text -> URL it points at."""
-    pass
+    return {link.text.strip(): link['href'] for link in td.find_all("a")}
 
 
 def parse_text_from_links(td: BeautifulSoup) -> List[str]:
```
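The new `parse_date_block` implies `<abbr>` titles shaped like `01 March 2022 @ 14:35` (split on `@`, then `%d %B %Y` and `%H:%M`). A standalone sketch with a made-up timestamp:

```python
from datetime import datetime
from bs4 import BeautifulSoup

# Sample cell in the shape parse_date_block expects (timestamp made up):
html = '<table><tr><td><abbr title="01 March 2022 @ 14:35">Mar 01, 2022</abbr></td></tr></table>'
td = BeautifulSoup(html, "lxml").find("td")

abbr = td.find("abbr")
date_str, time_str = abbr['title'].split('@')
date = datetime.strptime(date_str.strip(), "%d %B %Y")
time = datetime.strptime(time_str.strip(), "%H:%M")
print(datetime(date.year, date.month, date.day, time.hour, time.minute))
# 2022-03-01 14:35:00
```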
```diff
@@ -24,7 +50,54 @@ def parse_text_from_links(td: BeautifulSoup) -> List[str]:
 
 def parse_tr(name: str, content: BeautifulSoup) -> Optional[Tuple[str, Any]]:
     if name == "Updated":
-        pass
+        return "updated_at", parse_date_block(content)
+    elif name == "Release date":
+        return "released_at", parse_date_block(content)
+    elif name == "Published":
+        return "published_at", parse_date_block(content)
+    elif name == "Status":
+        return "status", parse_text_from_links(content)[0]
+    elif name == "Platforms":
+        return "platforms", parse_text_from_links(content)
+    elif name == "Publisher":
+        return "publisher", content.text.strip()
+    elif name == "Rating":
+        return None  # Read the AggregatedRating block instead!
+    elif name == "Author":
+        author, author_url = parse_links(content).popitem()
+        return "author", {"author": author, "author_url": author_url}
+    elif name == "Authors":
+        return "authors", parse_links(content)
+    elif name == "Genre":
+        return "genre", parse_links(content)
+    elif name == "Made with":
+        return "tools", parse_links(content)
+    elif name == "License":
+        return "license", parse_links(content)
+    elif name == "Asset license":
+        return "asset_license", parse_links(content)
+    elif name == "Tags":
+        return "tags", parse_links(content)
+    elif name == "Average session":
+        return "length", parse_text_from_links(content)[0]
+    elif name == "Languages":
+        return "languages", parse_links(content)
+    elif name == "Multiplayer":
+        return "multiplayer", parse_links(content)
+    elif name == "Player count":
+        return "player_count", content.text.strip()
+    elif name == "Accessibility":
+        return "accessibility", parse_links(content)
+    elif name == "Inputs":
+        return "inputs", parse_links(content)
+    elif name == "Links":
+        return "links", parse_links(content)
+    elif name == "Mentions":
+        return "mentions", parse_links(content)
+    else:
+        # Oops, you need to extend this with something new. Sorry.
+        # Make sure to add the block name to InfoboxMetadata as well!
+        raise NotImplementedError(f"Unknown infobox block name '{name}' - please file a new itch-dl issue.")
 
 
 def parse_infobox(infobox: BeautifulSoup) -> dict:
```
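Most branches above delegate to `parse_links`, which flattens a cell of comma-separated links into a text-to-URL dict. A minimal sketch (tag names and URLs made up):

```python
from bs4 import BeautifulSoup

html = ('<table><tr><td>'
        '<a href="https://itch.io/games/tag-horror">Horror</a>, '
        '<a href="https://itch.io/games/tag-puzzle">Puzzle</a>'
        '</td></tr></table>')
td = BeautifulSoup(html, "lxml").find("td")

print({link.text.strip(): link['href'] for link in td.find_all("a")})
# {'Horror': 'https://itch.io/games/tag-horror', 'Puzzle': 'https://itch.io/games/tag-puzzle'}
```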
```diff
@@ -42,6 +115,7 @@ def parse_infobox(infobox: BeautifulSoup) -> dict:
 
         parsed_block = parse_tr(name, content_td)
         if parsed_block:
+            # noinspection PyTypedDict
             meta[parsed_block[0]] = parsed_block[1]
 
     return meta
```