diff --git a/itch_dl/cli.py b/itch_dl/cli.py
index 5cdcb83..20de8bc 100644
--- a/itch_dl/cli.py
+++ b/itch_dl/cli.py
@@ -65,4 +65,4 @@ def run() -> int:
     # Grab all the download keys (there's no way to fetch them per title...):
     keys = get_download_keys(client)
 
-    return drive_downloads(jobs, download_to, args.api_key, keys, parallel=args.parallel)
+    return drive_downloads(jobs, download_to, args.mirror_web, args.api_key, keys, parallel=args.parallel)
diff --git a/itch_dl/downloader.py b/itch_dl/downloader.py
index 174533e..231a1fa 100644
--- a/itch_dl/downloader.py
+++ b/itch_dl/downloader.py
@@ -13,10 +13,11 @@ from tqdm.contrib.concurrent import thread_map
 from .api import ItchApiClient
 from .utils import ItchDownloadError, get_int_after_marker_in_json
 from .consts import ITCH_GAME_URL_REGEX
-
+from .infobox import parse_infobox
 
 TARGET_PATHS = {
     'site': 'site.html',
+    'cover': 'cover',
     'metadata': 'metadata.json',
     'files': 'files',
     'screenshots': 'screenshots'
@@ -42,18 +43,21 @@ class GameMetadata(TypedDict, total=False):
     author: str
     author_url: str
 
-    description: str
     cover_url: str
+    screenshots: List[str]
+    description: str
 
     created_at: str
+    released_at: str
     published_at: str
 
 
 class GameDownloader:
-    def __init__(self, download_to: str, api_key: str, keys: Dict[int, str]):
+    def __init__(self, download_to: str, mirror_web: bool, api_key: str, keys: Dict[int, str]):
         self.download_to = download_to
-        self.download_keys = keys
+        self.mirror_web = mirror_web
+        self.download_keys = keys
 
         self.client = ItchApiClient(api_key)
 
     def get_rating_json(self, site) -> Optional[dict]:
@@ -112,20 +116,33 @@ class GameDownloader:
         return game_id
 
     def extract_metadata(self, game_id: int, url: str, site: BeautifulSoup) -> GameMetadata:
+        rating_json: Optional[dict] = self.get_rating_json(site)
+        # get_rating_json() returns None when the page has no rating JSON-LD block.
+        title = rating_json.get("name") if rating_json else None
+
         description: Optional[str] = self.get_meta(site, property="og:description")
         if not description:
             description = self.get_meta(site, name="description")
 
+        screenshot_urls: List[str] = []
+        screenshots_node = site.find("div", class_="screenshot_list")
+        if screenshots_node:
+            screenshot_urls = [a['href'] for a in screenshots_node.find_all('a')]
+
         metadata = GameMetadata(
             game_id=game_id,
-            title=site.find("h1", class_="game_title").text.strip(),
+            title=title or site.find("h1", class_="game_title").text.strip(),
             url=url,
             cover_url=self.get_meta(site, property="og:image"),
-            description=description
+            screenshots=screenshot_urls,
+            description=description,
         )
 
-        TODO_KEYS = ['author', 'author_url', 'created_at', 'published_at']
-        TODO_rating_json: Optional[dict] = self.get_rating_json(site)
+        infobox_div = site.find("div", class_="game_info_panel_widget")
+        if infobox_div:
+            infobox = parse_infobox(infobox_div)
+
+        TODO_KEYS = ['author', 'author_url', 'created_at', 'released_at', 'published_at']
 
         return metadata
 
@@ -137,17 +153,17 @@ class GameDownloader:
 
         return credentials
 
-    def download_file(self, upload_id: int, download_path: Optional[str], creds: dict) -> str:
-        """Performs a request to download a given upload by its ID, optionally saves the
+    def download_file(self, url: str, download_path: Optional[str], credentials: dict) -> str:
+        """Performs a request to download a given file, optionally saves the
         file to the provided path and returns the final URL that was downloaded."""
         try:
             # No timeouts, chunked uploads, default retry strategy, should be all good?
-            with self.client.get(f"/uploads/{upload_id}/download", data=creds, stream=True) as r:
+            with self.client.get(url, data=credentials, stream=True) as r:
                 r.raise_for_status()
 
                 if download_path is not None:  # ...and it will be for external downloads.
                     with tqdm.wrapattr(open(download_path, "wb"), "write",
-                                       miniters=1, desc=str(upload_id),
+                                       miniters=1, desc=url,
                                        total=int(r.headers.get('content-length', 0))) as f:
                         for chunk in r.iter_content(chunk_size=1048576):  # 1MB chunks
                             f.write(chunk)
@@ -156,6 +172,10 @@ class GameDownloader:
         except HTTPError as e:
             raise ItchDownloadError(f"Unrecoverable download error: {e}")
 
+    def download_file_by_upload_id(self, upload_id: int, download_path: Optional[str], credentials: dict) -> str:
+        """Performs a request to download a given upload by its ID."""
+        return self.download_file(f"/uploads/{upload_id}/download", download_path, credentials)
+
     def download(self, url: str, skip_downloaded: bool = True):
         match = re.match(ITCH_GAME_URL_REGEX, url)
         if not match:
@@ -218,7 +238,7 @@ class GameDownloader:
             target_path = None if upload_is_external else os.path.join(paths['files'], file_name)
 
             try:
-                target_url = self.download_file(upload_id, target_path, credentials)
+                target_url = self.download_file_by_upload_id(upload_id, target_path, credentials)
             except ItchDownloadError as e:
                 errors.append(f"Download failed for upload {upload}: {e}")
                 continue
@@ -245,6 +265,23 @@ class GameDownloader:
             logging.warning(f"Game {title} has external download URLs: {external_urls}")
 
         # TODO: Screenshots and site assets
+        if self.mirror_web:
+            os.makedirs(paths['screenshots'], exist_ok=True)
+            for screenshot in metadata['screenshots']:
+                file_name = os.path.basename(screenshot)
+                try:
+                    self.download_file(screenshot, os.path.join(paths['screenshots'], file_name), credentials={})
+                except Exception as e:
+                    errors.append(f"Screenshot download failed (this is not fatal): {e}")
+
+        # get_meta() may return None for og:image - skip the cover download if it's missing.
+        if metadata.get('cover_url'):
+            try:
+                cover_url = metadata['cover_url']
+                self.download_file(cover_url, paths['cover'] + os.path.splitext(cover_url)[-1], credentials={})
+            except Exception as e:
+                errors.append(f"Cover art download failed (this is not fatal): {e}")
+
         with open(paths['site'], 'w') as f:
             f.write(site.prettify())
 
@@ -255,11 +291,11 @@ class GameDownloader:
             logging.error(f"Game {title} has download errors: {errors}")
 
         logging.info("Finished job %s (%s)", url, title)
-        return DownloadResult(url, True, errors, external_urls)
+        return DownloadResult(url, len(errors) == 0, errors, external_urls)
 
 
-def drive_downloads(jobs: List[str], download_to: str, api_key: str, keys: Dict[int, str], parallel: int = 1):
-    downloader = GameDownloader(download_to, api_key, keys)
+def drive_downloads(jobs: List[str], download_to: str, mirror_web: bool, api_key: str, keys: Dict[int, str], parallel: int = 1):
+    downloader = GameDownloader(download_to, mirror_web, api_key, keys)
     tqdm_args = {
         "desc": "Games",
         "unit": "game",
diff --git a/itch_dl/infobox.py b/itch_dl/infobox.py
new file mode 100644
index 0000000..157d533
--- /dev/null
+++ b/itch_dl/infobox.py
@@ -0,0 +1,47 @@
+from datetime import datetime
+from typing import TypedDict, Dict, List, Any, Tuple, Optional
+
+from bs4 import BeautifulSoup
+
+
+class InfoboxMetadata(TypedDict, total=False):
+    pass
+
+
+def parse_date_block(td: BeautifulSoup) -> datetime:
+    raise NotImplementedError("Not yet!")
+
+
+def parse_links(td: BeautifulSoup) -> Dict[str, str]:
+    """Parses a block of comma-separated links, returns a dict
+    of link text -> URL it points at."""
+    return {a.text.strip(): a['href'] for a in td.find_all("a")}
+
+
+def parse_text_from_links(td: BeautifulSoup) -> List[str]:
+    return list(parse_links(td).keys())
+
+
+def parse_tr(name: str, content: BeautifulSoup) -> Optional[Tuple[str, Any]]:
+    if name == "Updated":
+        pass
+
+
+def parse_infobox(infobox: BeautifulSoup) -> dict:
+    """Feed it the infobox div, out goes a dict
+    of parsed metadata blocks."""
+    meta = InfoboxMetadata()
+
+    for tr in infobox.find_all("tr"):
+        tds = tr.find_all("td")
+        if len(tds) < 2:
+            continue
+
+        name_td, content_td = tds[0], tds[1]
+        name = name_td.text.strip()
+
+        parsed_block = parse_tr(name, content_td)
+        if parsed_block:
+            meta[parsed_block[0]] = parsed_block[1]
+
+    return meta