1
0
forked from Mirrors/itch-dl

Implement infobox parsing, misc bugfixes, version bump

This commit is contained in:
Ryszard Knop
2022-05-15 20:10:32 +02:00
parent f5c0f4658d
commit 008e6870e8
5 changed files with 149 additions and 30 deletions

View File

@@ -2,7 +2,8 @@ import os
import json
import re
import logging
from typing import Tuple, List, Dict, TypedDict, Optional
import urllib.parse
from typing import List, Dict, TypedDict, Optional, Union
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError
@@ -13,7 +14,7 @@ from tqdm.contrib.concurrent import thread_map
from .api import ItchApiClient
from .utils import ItchDownloadError, get_int_after_marker_in_json
from .consts import ITCH_GAME_URL_REGEX
from .infobox import parse_infobox
from .infobox import parse_infobox, InfoboxMetadata
TARGET_PATHS = {
'site': 'site.html',
@@ -25,11 +26,11 @@ TARGET_PATHS = {
class DownloadResult:
def __init__(self, url: str, success: bool, errors, external_urls: Optional[List[str]] = None):
def __init__(self, url: str, success: bool, errors, external_urls: List[str]):
self.url = url
self.success = success
self.errors = errors
self.external_urls = external_urls
self.errors = errors or []
self.external_urls = external_urls or []
class GameMetadata(TypedDict, total=False):
@@ -47,7 +48,11 @@ class GameMetadata(TypedDict, total=False):
screenshots: List[str]
description: str
rating: Dict[str, Union[float, int]]
extra: InfoboxMetadata
created_at: str
updated_at: str
released_at: str
published_at: str
@@ -60,7 +65,8 @@ class GameDownloader:
self.download_keys = keys
self.client = ItchApiClient(api_key)
def get_rating_json(self, site) -> Optional[dict]:
@staticmethod
def get_rating_json(site) -> Optional[dict]:
for ldjson_node in site.find_all("script", type="application/ld+json"):
try:
ldjson: dict = json.loads(ldjson_node.text.strip())
@@ -71,7 +77,8 @@ class GameDownloader:
return None
def get_meta(self, site, **kwargs) -> Optional[str]:
@staticmethod
def get_meta(site, **kwargs) -> Optional[str]:
"""Grabs <meta property="xyz" content="value"/> values."""
node = site.find("meta", attrs=kwargs)
if not node:
@@ -140,8 +147,34 @@ class GameDownloader:
infobox_div = site.find("div", class_="game_info_panel_widget")
if infobox_div:
infobox = parse_infobox(infobox_div)
for dt in ('created_at', 'updated_at', 'released_at', 'published_at'):
if dt in infobox:
# noinspection PyTypedDict
metadata[dt] = infobox[dt].isoformat()
del infobox[dt]
TODO_KEYS = ['author', 'author_url', 'created_at', 'released_at', 'published_at']
if 'author' in infobox:
metadata['author'] = infobox['author']['author']
metadata['author_url'] = infobox['author']['author_url']
del infobox['author']
if 'authors' in infobox and 'author' not in metadata:
# Some games may have multiple authors (ex. compilations).
metadata['author'] = "Multiple authors"
metadata['author_url'] = f"https://{urllib.parse.urlparse(url).netloc}"
metadata['extra'] = infobox
agg_rating = rating_json.get('aggregateRating')
if agg_rating:
try:
metadata['rating'] = {
'average': float(agg_rating['ratingValue']),
'votes': agg_rating['ratingCount']
}
except: # noqa
logging.exception("Could not extract the rating metadata...")
pass # Nope, just, don't
return metadata
@@ -179,7 +212,7 @@ class GameDownloader:
def download(self, url: str, skip_downloaded: bool = True):
match = re.match(ITCH_GAME_URL_REGEX, url)
if not match:
return DownloadResult(url, False, [f"Game URL is invalid: {url} - please file a new issue."])
return DownloadResult(url, False, [f"Game URL is invalid: {url} - please file a new issue."], [])
author, game = match['author'], match['game']
@@ -192,14 +225,14 @@ class GameDownloader:
# As metadata is the final file we write, all the files
# should already be downloaded at this point.
logging.info("Skipping already-downloaded game for URL: %s", url)
return DownloadResult(url, True, [f"Game already downloaded."])
return DownloadResult(url, True, [f"Game already downloaded."], [])
try:
logging.info("Downloading %s", url)
r = self.client.get(url, append_api_key=False)
r.raise_for_status()
except Exception as e:
return DownloadResult(url, False, [f"Could not download the game site for {url}: {e}"])
return DownloadResult(url, False, [f"Could not download the game site for {url}: {e}"], [])
site = BeautifulSoup(r.text, features="lxml")
try:
@@ -207,14 +240,14 @@ class GameDownloader:
metadata = self.extract_metadata(game_id, url, site)
title = metadata['title'] or game
except ItchDownloadError as e:
return DownloadResult(url, False, [str(e)])
return DownloadResult(url, False, [str(e)], [])
credentials = self.get_credentials(title, game_id)
try:
game_uploads_req = self.client.get(f"/games/{game_id}/uploads", data=credentials, timeout=15)
game_uploads_req.raise_for_status()
except Exception as e:
return DownloadResult(url, False, [f"Could not fetch game uploads for {title}: {e}"])
return DownloadResult(url, False, [f"Could not fetch game uploads for {title}: {e}"], [])
game_uploads = game_uploads_req.json()['uploads']
logging.debug("Found %d upload(s): %s", len(game_uploads), str(game_uploads))
@@ -264,17 +297,20 @@ class GameDownloader:
if len(external_urls) > 0:
logging.warning(f"Game {title} has external download URLs: {external_urls}")
# TODO: Screenshots and site assets
# TODO: Mirror JS/CSS assets
if self.mirror_web:
os.makedirs(paths['screenshots'], exist_ok=True)
for screenshot in metadata['screenshots']:
if not screenshot:
continue
file_name = os.path.basename(screenshot)
try:
self.download_file(screenshot, os.path.join(paths['screenshots'], file_name), credentials={})
except Exception as e:
errors.append(f"Screenshot download failed (this is not fatal): {e}")
if 'cover_url' in metadata:
if metadata.get('cover_url'):
try:
cover_url = metadata['cover_url']
self.download_file(cover_url, paths['cover'] + os.path.splitext(cover_url)[-1], credentials={})
@@ -285,7 +321,7 @@ class GameDownloader:
f.write(site.prettify())
with open(paths['metadata'], 'w') as f:
json.dump(metadata, f)
json.dump(metadata, f, indent=4)
if len(errors) > 0:
logging.error(f"Game {title} has download errors: {errors}")
@@ -294,7 +330,14 @@ class GameDownloader:
return DownloadResult(url, len(errors) == 0, errors, external_urls)
def drive_downloads(jobs: List[str], download_to: str, mirror_web: bool, api_key: str, keys: Dict[int, str], parallel: int = 1):
def drive_downloads(
jobs: List[str],
download_to: str,
mirror_web: bool,
api_key: str,
keys: Dict[int, str],
parallel: int = 1
):
downloader = GameDownloader(download_to, mirror_web, api_key, keys)
tqdm_args = {
"desc": "Games",