mirror of
https://github.com/DragoonAethis/itch-dl.git
synced 2024-12-21 02:21:52 +01:00
Implement screenshot/cover art downloads, initial infobox parsing
This commit is contained in:
parent
8a6bed69f4
commit
f5c0f4658d
@ -65,4 +65,4 @@ def run() -> int:
|
|||||||
# Grab all the download keys (there's no way to fetch them per title...):
|
# Grab all the download keys (there's no way to fetch them per title...):
|
||||||
keys = get_download_keys(client)
|
keys = get_download_keys(client)
|
||||||
|
|
||||||
return drive_downloads(jobs, download_to, args.api_key, keys, parallel=args.parallel)
|
return drive_downloads(jobs, download_to, args.mirror_web, args.api_key, keys, parallel=args.parallel)
|
||||||
|
@ -13,10 +13,11 @@ from tqdm.contrib.concurrent import thread_map
|
|||||||
from .api import ItchApiClient
|
from .api import ItchApiClient
|
||||||
from .utils import ItchDownloadError, get_int_after_marker_in_json
|
from .utils import ItchDownloadError, get_int_after_marker_in_json
|
||||||
from .consts import ITCH_GAME_URL_REGEX
|
from .consts import ITCH_GAME_URL_REGEX
|
||||||
|
from .infobox import parse_infobox
|
||||||
|
|
||||||
TARGET_PATHS = {
|
TARGET_PATHS = {
|
||||||
'site': 'site.html',
|
'site': 'site.html',
|
||||||
|
'cover': 'cover',
|
||||||
'metadata': 'metadata.json',
|
'metadata': 'metadata.json',
|
||||||
'files': 'files',
|
'files': 'files',
|
||||||
'screenshots': 'screenshots'
|
'screenshots': 'screenshots'
|
||||||
@ -42,18 +43,21 @@ class GameMetadata(TypedDict, total=False):
|
|||||||
author: str
|
author: str
|
||||||
author_url: str
|
author_url: str
|
||||||
|
|
||||||
description: str
|
|
||||||
cover_url: str
|
cover_url: str
|
||||||
|
screenshots: List[str]
|
||||||
|
description: str
|
||||||
|
|
||||||
created_at: str
|
created_at: str
|
||||||
|
released_at: str
|
||||||
published_at: str
|
published_at: str
|
||||||
|
|
||||||
|
|
||||||
class GameDownloader:
|
class GameDownloader:
|
||||||
def __init__(self, download_to: str, api_key: str, keys: Dict[int, str]):
|
def __init__(self, download_to: str, mirror_web: bool, api_key: str, keys: Dict[int, str]):
|
||||||
self.download_to = download_to
|
self.download_to = download_to
|
||||||
self.download_keys = keys
|
self.mirror_web = mirror_web
|
||||||
|
|
||||||
|
self.download_keys = keys
|
||||||
self.client = ItchApiClient(api_key)
|
self.client = ItchApiClient(api_key)
|
||||||
|
|
||||||
def get_rating_json(self, site) -> Optional[dict]:
|
def get_rating_json(self, site) -> Optional[dict]:
|
||||||
@ -112,20 +116,32 @@ class GameDownloader:
|
|||||||
return game_id
|
return game_id
|
||||||
|
|
||||||
def extract_metadata(self, game_id: int, url: str, site: BeautifulSoup) -> GameMetadata:
    """Builds the GameMetadata record for a game from its parsed page.

    Args:
        game_id: ID of the game; stored unchanged in the result.
        url: URL of the game page; stored unchanged in the result.
        site: Parsed HTML of the game page.

    Returns:
        A GameMetadata with game_id, title, url, cover_url, screenshots
        and description filled in.

    Raises:
        AttributeError: if no title is available from the rating JSON and
            the page has no <h1 class="game_title"> element either.
    """
    # get_rating_json() is Optional: only read the name when JSON was found,
    # otherwise fall through to the <h1> title below.
    rating_json: Optional[dict] = self.get_rating_json(site)
    title = rating_json.get("name") if rating_json else None

    # Prefer the OpenGraph description, fall back to the plain meta tag.
    description: Optional[str] = self.get_meta(site, property="og:description")
    if not description:
        description = self.get_meta(site, name="description")

    screenshot_urls: List[str] = []
    screenshots_node = site.find("div", class_="screenshot_list")
    if screenshots_node:
        # Skip anchors that carry no href instead of failing on them.
        screenshot_urls = [a['href'] for a in screenshots_node.find_all('a') if a.get('href')]

    metadata = GameMetadata(
        game_id=game_id,
        title=title or site.find("h1", class_="game_title").text.strip(),
        url=url,
        cover_url=self.get_meta(site, property="og:image"),
        screenshots=screenshot_urls,
        description=description,
    )

    infobox_div = site.find("div", class_="game_info_panel_widget")
    if infobox_div:
        # TODO: the parsed infobox is not merged into the metadata yet.
        infobox = parse_infobox(infobox_div)

    # TODO: fill in author, author_url, created_at, released_at, published_at.

    return metadata
|
||||||
|
|
||||||
@ -137,17 +153,17 @@ class GameDownloader:
|
|||||||
|
|
||||||
return credentials
|
return credentials
|
||||||
|
|
||||||
def download_file(self, upload_id: int, download_path: Optional[str], creds: dict) -> str:
|
def download_file(self, url: str, download_path: Optional[str], credentials: dict) -> str:
|
||||||
"""Performs a request to download a given upload by its ID, optionally saves the
|
"""Performs a request to download a given file, optionally saves the
|
||||||
file to the provided path and returns the final URL that was downloaded."""
|
file to the provided path and returns the final URL that was downloaded."""
|
||||||
try:
|
try:
|
||||||
# No timeouts, chunked uploads, default retry strategy, should be all good?
|
# No timeouts, chunked uploads, default retry strategy, should be all good?
|
||||||
with self.client.get(f"/uploads/{upload_id}/download", data=creds, stream=True) as r:
|
with self.client.get(url, data=credentials, stream=True) as r:
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
|
|
||||||
if download_path is not None: # ...and it will be for external downloads.
|
if download_path is not None: # ...and it will be for external downloads.
|
||||||
with tqdm.wrapattr(open(download_path, "wb"), "write",
|
with tqdm.wrapattr(open(download_path, "wb"), "write",
|
||||||
miniters=1, desc=str(upload_id),
|
miniters=1, desc=url,
|
||||||
total=int(r.headers.get('content-length', 0))) as f:
|
total=int(r.headers.get('content-length', 0))) as f:
|
||||||
for chunk in r.iter_content(chunk_size=1048576): # 1MB chunks
|
for chunk in r.iter_content(chunk_size=1048576): # 1MB chunks
|
||||||
f.write(chunk)
|
f.write(chunk)
|
||||||
@ -156,6 +172,10 @@ class GameDownloader:
|
|||||||
except HTTPError as e:
|
except HTTPError as e:
|
||||||
raise ItchDownloadError(f"Unrecoverable download error: {e}")
|
raise ItchDownloadError(f"Unrecoverable download error: {e}")
|
||||||
|
|
||||||
|
def download_file_by_upload_id(self, upload_id: int, download_path: Optional[str], credentials: dict) -> str:
    """Downloads an upload identified by its ID.

    Builds the upload's download endpoint path and hands it, together with
    the target path and credentials, to download_file(); whatever that call
    returns is returned unchanged.
    """
    upload_endpoint = f"/uploads/{upload_id}/download"
    return self.download_file(upload_endpoint, download_path, credentials)
|
||||||
|
|
||||||
def download(self, url: str, skip_downloaded: bool = True):
|
def download(self, url: str, skip_downloaded: bool = True):
|
||||||
match = re.match(ITCH_GAME_URL_REGEX, url)
|
match = re.match(ITCH_GAME_URL_REGEX, url)
|
||||||
if not match:
|
if not match:
|
||||||
@ -218,7 +238,7 @@ class GameDownloader:
|
|||||||
target_path = None if upload_is_external else os.path.join(paths['files'], file_name)
|
target_path = None if upload_is_external else os.path.join(paths['files'], file_name)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
target_url = self.download_file(upload_id, target_path, credentials)
|
target_url = self.download_file_by_upload_id(upload_id, target_path, credentials)
|
||||||
except ItchDownloadError as e:
|
except ItchDownloadError as e:
|
||||||
errors.append(f"Download failed for upload {upload}: {e}")
|
errors.append(f"Download failed for upload {upload}: {e}")
|
||||||
continue
|
continue
|
||||||
@ -245,6 +265,22 @@ class GameDownloader:
|
|||||||
logging.warning(f"Game {title} has external download URLs: {external_urls}")
|
logging.warning(f"Game {title} has external download URLs: {external_urls}")
|
||||||
|
|
||||||
# TODO: Screenshots and site assets
|
# TODO: Screenshots and site assets
|
||||||
|
if self.mirror_web:
|
||||||
|
os.makedirs(paths['screenshots'], exist_ok=True)
|
||||||
|
for screenshot in metadata['screenshots']:
|
||||||
|
file_name = os.path.basename(screenshot)
|
||||||
|
try:
|
||||||
|
self.download_file(screenshot, os.path.join(paths['screenshots'], file_name), credentials={})
|
||||||
|
except Exception as e:
|
||||||
|
errors.append(f"Screenshot download failed (this is not fatal): {e}")
|
||||||
|
|
||||||
|
if 'cover_url' in metadata:
|
||||||
|
try:
|
||||||
|
cover_url = metadata['cover_url']
|
||||||
|
self.download_file(cover_url, paths['cover'] + os.path.splitext(cover_url)[-1], credentials={})
|
||||||
|
except Exception as e:
|
||||||
|
errors.append(f"Cover art download failed (this is not fatal): {e}")
|
||||||
|
|
||||||
with open(paths['site'], 'w') as f:
|
with open(paths['site'], 'w') as f:
|
||||||
f.write(site.prettify())
|
f.write(site.prettify())
|
||||||
|
|
||||||
@ -255,11 +291,11 @@ class GameDownloader:
|
|||||||
logging.error(f"Game {title} has download errors: {errors}")
|
logging.error(f"Game {title} has download errors: {errors}")
|
||||||
|
|
||||||
logging.info("Finished job %s (%s)", url, title)
|
logging.info("Finished job %s (%s)", url, title)
|
||||||
return DownloadResult(url, True, errors, external_urls)
|
return DownloadResult(url, len(errors) == 0, errors, external_urls)
|
||||||
|
|
||||||
|
|
||||||
def drive_downloads(jobs: List[str], download_to: str, api_key: str, keys: Dict[int, str], parallel: int = 1):
|
def drive_downloads(jobs: List[str], download_to: str, mirror_web: bool, api_key: str, keys: Dict[int, str], parallel: int = 1):
|
||||||
downloader = GameDownloader(download_to, api_key, keys)
|
downloader = GameDownloader(download_to, mirror_web, api_key, keys)
|
||||||
tqdm_args = {
|
tqdm_args = {
|
||||||
"desc": "Games",
|
"desc": "Games",
|
||||||
"unit": "game",
|
"unit": "game",
|
||||||
|
47
itch_dl/infobox.py
Normal file
47
itch_dl/infobox.py
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
from datetime import datetime
|
||||||
|
from typing import TypedDict, Dict, List, Any, Tuple, Optional
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
|
class InfoboxMetadata(TypedDict, total=False):
    """Typed dict for metadata parsed out of a game page infobox.

    No keys are declared yet; every key is optional (total=False).
    """
|
||||||
|
|
||||||
|
|
||||||
|
def parse_date_block(td: BeautifulSoup) -> datetime:
    """Placeholder for parsing a date cell of the infobox.

    Raises:
        NotImplementedError: always; the parser is not written yet.
    """
    reason = "Not yet!"
    raise NotImplementedError(reason)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_links(td: BeautifulSoup) -> Dict[str, str]:
    """Parses blocks of comma-separated <a> blocks, returns a dict
    of link text -> URL it points at.

    Link text is stripped of surrounding whitespace. Anchors without an
    href attribute are skipped. If several anchors share the same text,
    the last one wins.
    """
    links: Dict[str, str] = {}
    for anchor in td.find_all("a"):
        href = anchor.get("href")
        if href:
            links[anchor.text.strip()] = href

    return links
|
||||||
|
|
||||||
|
|
||||||
|
def parse_text_from_links(td: BeautifulSoup) -> List[str]:
    """Returns just the link texts found by parse_links() for this cell.

    An empty list is returned when parse_links() yields nothing.
    """
    links = parse_links(td)
    # Guard against a missing result (e.g. parse_links() being unimplemented)
    # rather than failing on None.keys().
    return list(links) if links else []
|
||||||
|
|
||||||
|
|
||||||
|
def parse_tr(name: str, content: BeautifulSoup) -> Optional[Tuple[str, Any]]:
    """Parses a single infobox row into a (key, value) pair.

    Currently a placeholder: the "Updated" row is recognised but not
    handled yet, so None is returned for every row.
    """
    if name != "Updated":
        return None

    # TODO: handle the "Updated" row.
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def parse_infobox(infobox: BeautifulSoup) -> dict:
    """Turns a <div class="game_info_panel_widget"> node into a dict of
    parsed metadata blocks.

    Each table row with at least two cells is passed to parse_tr() as
    (stripped text of the first cell, second cell). Rows for which
    parse_tr() returns a falsy result are ignored.
    """
    parsed = InfoboxMetadata()

    for row in infobox.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) >= 2:
            label = cells[0].text.strip()
            entry = parse_tr(label, cells[1])
            if entry:
                parsed[entry[0]] = entry[1]

    return parsed
|
Loading…
Reference in New Issue
Block a user