Implement screenshot/cover art downloads, initial infobox parsing

This commit is contained in:
Ryszard Knop 2022-05-15 18:51:13 +02:00
parent 8a6bed69f4
commit f5c0f4658d
3 changed files with 100 additions and 17 deletions

View File

@ -65,4 +65,4 @@ def run() -> int:
# Grab all the download keys (there's no way to fetch them per title...): # Grab all the download keys (there's no way to fetch them per title...):
keys = get_download_keys(client) keys = get_download_keys(client)
return drive_downloads(jobs, download_to, args.api_key, keys, parallel=args.parallel) return drive_downloads(jobs, download_to, args.mirror_web, args.api_key, keys, parallel=args.parallel)

View File

@ -13,10 +13,11 @@ from tqdm.contrib.concurrent import thread_map
from .api import ItchApiClient from .api import ItchApiClient
from .utils import ItchDownloadError, get_int_after_marker_in_json from .utils import ItchDownloadError, get_int_after_marker_in_json
from .consts import ITCH_GAME_URL_REGEX from .consts import ITCH_GAME_URL_REGEX
from .infobox import parse_infobox
TARGET_PATHS = { TARGET_PATHS = {
'site': 'site.html', 'site': 'site.html',
'cover': 'cover',
'metadata': 'metadata.json', 'metadata': 'metadata.json',
'files': 'files', 'files': 'files',
'screenshots': 'screenshots' 'screenshots': 'screenshots'
@ -42,18 +43,21 @@ class GameMetadata(TypedDict, total=False):
author: str author: str
author_url: str author_url: str
description: str
cover_url: str cover_url: str
screenshots: List[str]
description: str
created_at: str created_at: str
released_at: str
published_at: str published_at: str
class GameDownloader: class GameDownloader:
def __init__(self, download_to: str, api_key: str, keys: Dict[int, str]): def __init__(self, download_to: str, mirror_web: bool, api_key: str, keys: Dict[int, str]):
self.download_to = download_to self.download_to = download_to
self.download_keys = keys self.mirror_web = mirror_web
self.download_keys = keys
self.client = ItchApiClient(api_key) self.client = ItchApiClient(api_key)
def get_rating_json(self, site) -> Optional[dict]: def get_rating_json(self, site) -> Optional[dict]:
@ -112,20 +116,32 @@ class GameDownloader:
return game_id return game_id
def extract_metadata(self, game_id: int, url: str, site: BeautifulSoup) -> GameMetadata: def extract_metadata(self, game_id: int, url: str, site: BeautifulSoup) -> GameMetadata:
rating_json: Optional[dict] = self.get_rating_json(site)
title = rating_json.get("name")
description: Optional[str] = self.get_meta(site, property="og:description") description: Optional[str] = self.get_meta(site, property="og:description")
if not description: if not description:
description = self.get_meta(site, name="description") description = self.get_meta(site, name="description")
screenshot_urls: List[str] = []
screenshots_node = site.find("div", class_="screenshot_list")
if screenshots_node:
screenshot_urls = [a['href'] for a in screenshots_node.find_all('a')]
metadata = GameMetadata( metadata = GameMetadata(
game_id=game_id, game_id=game_id,
title=site.find("h1", class_="game_title").text.strip(), title=title or site.find("h1", class_="game_title").text.strip(),
url=url, url=url,
cover_url=self.get_meta(site, property="og:image"), cover_url=self.get_meta(site, property="og:image"),
description=description screenshots=screenshot_urls,
description=description,
) )
TODO_KEYS = ['author', 'author_url', 'created_at', 'published_at'] infobox_div = site.find("div", class_="game_info_panel_widget")
TODO_rating_json: Optional[dict] = self.get_rating_json(site) if infobox_div:
infobox = parse_infobox(infobox_div)
TODO_KEYS = ['author', 'author_url', 'created_at', 'released_at', 'published_at']
return metadata return metadata
@ -137,17 +153,17 @@ class GameDownloader:
return credentials return credentials
def download_file(self, upload_id: int, download_path: Optional[str], creds: dict) -> str: def download_file(self, url: str, download_path: Optional[str], credentials: dict) -> str:
"""Performs a request to download a given upload by its ID, optionally saves the """Performs a request to download a given file, optionally saves the
file to the provided path and returns the final URL that was downloaded.""" file to the provided path and returns the final URL that was downloaded."""
try: try:
# No timeouts, chunked uploads, default retry strategy, should be all good? # No timeouts, chunked uploads, default retry strategy, should be all good?
with self.client.get(f"/uploads/{upload_id}/download", data=creds, stream=True) as r: with self.client.get(url, data=credentials, stream=True) as r:
r.raise_for_status() r.raise_for_status()
if download_path is not None: # ...and it will be for external downloads. if download_path is not None: # ...and it will be for external downloads.
with tqdm.wrapattr(open(download_path, "wb"), "write", with tqdm.wrapattr(open(download_path, "wb"), "write",
miniters=1, desc=str(upload_id), miniters=1, desc=url,
total=int(r.headers.get('content-length', 0))) as f: total=int(r.headers.get('content-length', 0))) as f:
for chunk in r.iter_content(chunk_size=1048576): # 1MB chunks for chunk in r.iter_content(chunk_size=1048576): # 1MB chunks
f.write(chunk) f.write(chunk)
@ -156,6 +172,10 @@ class GameDownloader:
except HTTPError as e: except HTTPError as e:
raise ItchDownloadError(f"Unrecoverable download error: {e}") raise ItchDownloadError(f"Unrecoverable download error: {e}")
def download_file_by_upload_id(self, upload_id: int, download_path: Optional[str], credentials: dict) -> str:
"""Performs a request to download a given upload by its ID."""
return self.download_file(f"/uploads/{upload_id}/download", download_path, credentials)
def download(self, url: str, skip_downloaded: bool = True): def download(self, url: str, skip_downloaded: bool = True):
match = re.match(ITCH_GAME_URL_REGEX, url) match = re.match(ITCH_GAME_URL_REGEX, url)
if not match: if not match:
@ -218,7 +238,7 @@ class GameDownloader:
target_path = None if upload_is_external else os.path.join(paths['files'], file_name) target_path = None if upload_is_external else os.path.join(paths['files'], file_name)
try: try:
target_url = self.download_file(upload_id, target_path, credentials) target_url = self.download_file_by_upload_id(upload_id, target_path, credentials)
except ItchDownloadError as e: except ItchDownloadError as e:
errors.append(f"Download failed for upload {upload}: {e}") errors.append(f"Download failed for upload {upload}: {e}")
continue continue
@ -245,6 +265,22 @@ class GameDownloader:
logging.warning(f"Game {title} has external download URLs: {external_urls}") logging.warning(f"Game {title} has external download URLs: {external_urls}")
# TODO: Screenshots and site assets # TODO: Screenshots and site assets
if self.mirror_web:
os.makedirs(paths['screenshots'], exist_ok=True)
for screenshot in metadata['screenshots']:
file_name = os.path.basename(screenshot)
try:
self.download_file(screenshot, os.path.join(paths['screenshots'], file_name), credentials={})
except Exception as e:
errors.append(f"Screenshot download failed (this is not fatal): {e}")
if 'cover_url' in metadata:
try:
cover_url = metadata['cover_url']
self.download_file(cover_url, paths['cover'] + os.path.splitext(cover_url)[-1], credentials={})
except Exception as e:
errors.append(f"Cover art download failed (this is not fatal): {e}")
with open(paths['site'], 'w') as f: with open(paths['site'], 'w') as f:
f.write(site.prettify()) f.write(site.prettify())
@ -255,11 +291,11 @@ class GameDownloader:
logging.error(f"Game {title} has download errors: {errors}") logging.error(f"Game {title} has download errors: {errors}")
logging.info("Finished job %s (%s)", url, title) logging.info("Finished job %s (%s)", url, title)
return DownloadResult(url, True, errors, external_urls) return DownloadResult(url, len(errors) == 0, errors, external_urls)
def drive_downloads(jobs: List[str], download_to: str, api_key: str, keys: Dict[int, str], parallel: int = 1): def drive_downloads(jobs: List[str], download_to: str, mirror_web: bool, api_key: str, keys: Dict[int, str], parallel: int = 1):
downloader = GameDownloader(download_to, api_key, keys) downloader = GameDownloader(download_to, mirror_web, api_key, keys)
tqdm_args = { tqdm_args = {
"desc": "Games", "desc": "Games",
"unit": "game", "unit": "game",

47
itch_dl/infobox.py Normal file
View File

@ -0,0 +1,47 @@
from datetime import datetime
from typing import TypedDict, Dict, List, Any, Tuple, Optional
from bs4 import BeautifulSoup
class InfoboxMetadata(TypedDict, total=False):
    """Typed bag for parsed infobox fields.

    All keys are optional (total=False); parse_infobox fills in only the
    rows it recognizes. Concrete field names are still TODO.
    """
def parse_date_block(td: BeautifulSoup) -> datetime:
    """Parse an infobox date cell into a datetime.

    Not implemented yet — presumably the cell carries a machine-readable
    timestamp (e.g. in an <abbr> title attribute); TODO confirm against
    the live itch.io markup before implementing.
    """
    raise NotImplementedError("Not yet!")
def parse_links(td: BeautifulSoup) -> Dict[str, str]:
    """Parses blocks of comma-separated <a> blocks, returns a dict
    of link text -> URL it points at.

    :param td: infobox cell (<td>) containing zero or more <a> tags.
    :return: mapping of stripped anchor text to its href attribute;
        empty dict when the cell has no links.
    """
    # The original stub fell through and returned None, which broke
    # parse_text_from_links (it calls .keys() on the result). Build the
    # promised dict instead. Duplicate link texts collapse to the last URL,
    # which matches the documented "text -> URL" contract.
    return {a.text.strip(): a['href'] for a in td.find_all("a")}
def parse_text_from_links(td: BeautifulSoup) -> List[str]:
    """Returns only the link texts from a cell of <a> blocks (URLs dropped)."""
    links = parse_links(td)
    return [text for text in links.keys()]
def parse_tr(name: str, content: BeautifulSoup) -> Optional[Tuple[str, Any]]:
    """Dispatch a single infobox row by its label.

    :param name: stripped text of the row's label cell.
    :param content: the row's value cell (currently unused).
    :return: a (metadata_key, parsed_value) pair, or None for rows that
        are not handled yet — which, so far, is all of them.
    """
    if name == "Updated":
        pass  # TODO: parse the date block once parse_date_block is implemented
    return None
def parse_infobox(infobox: BeautifulSoup) -> dict:
    """Feed it <div class="game_info_panel_widget">, out goes a dict
    of parsed metadata blocks.

    Rows with fewer than two cells carry no label/value pair and are
    skipped; rows parse_tr does not recognize are silently dropped.
    """
    meta = InfoboxMetadata()
    for row in infobox.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) < 2:
            continue  # Not a label/value row.
        label = cells[0].text.strip()
        parsed = parse_tr(label, cells[1])
        if parsed:
            key, value = parsed
            meta[key] = value
    return meta