Add filtering job URLs by regex/globs

2025-04-03 18:50:33 +02:00
parent 5ab0dc0309
commit a092532192
4 changed files with 50 additions and 3 deletions
--- a/itch_dl/cli.py
+++ b/itch_dl/cli.py
@@ -54,6 +54,12 @@ def parse_args() -> argparse.Namespace:
                        help="filter downloaded files with a shell-style glob/fnmatch (unmatched files are skipped)")
    parser.add_argument("--filter-files-regex", metavar="regex", default=None,
                        help="filter downloaded files with a Python regex (unmatched files are skipped)")
    parser.add_argument("--filter-urls-glob", metavar="glob", default=None,
                        help="filter itch URLs with a shell-style glob/fnmatch (unmatched URLs are skipped)")
    parser.add_argument("--filter-urls-regex", metavar="regex", default=None,
                        help="filter itch URLs with a Python regex (unmatched URLs are skipped)")
    parser.add_argument("--verbose", action="store_true",
                        help="print verbose logs")
@@ -89,8 +95,10 @@ def run() -> int:
        )
    jobs = get_jobs_for_url_or_path(url_or_path, settings)
-    jobs = list(set(jobs))  # Deduplicate, just in case...
+    logging.info("Found %d URL(s) total.", len(jobs))
-    logging.info("Found %d URL(s).", len(jobs))
+
    jobs = preprocess_job_urls(jobs, settings)
    logging.info("Will process %d URL(s) after filtering and deduplication.", len(jobs))
    if len(jobs) == 0:
        sys.exit("No URLs to download.")
--- a/itch_dl/config.py
+++ b/itch_dl/config.py
@@ -27,6 +27,9 @@ class Settings:
    filter_files_glob: str | None = None
    filter_files_regex: str | None = None
    filter_urls_glob: str | None = None
    filter_urls_regex: str | None = None
    verbose: bool = False
--- a/itch_dl/handlers.py
+++ b/itch_dl/handlers.py
@@ -7,7 +7,7 @@ from http.client import responses
 from bs4 import BeautifulSoup
 from .api import ItchApiClient
-from .utils import ItchDownloadError, get_int_after_marker_in_json
+from .utils import ItchDownloadError, get_int_after_marker_in_json, should_skip_item_by_glob, should_skip_item_by_regex
 from .consts import ITCH_API, ITCH_BASE, ITCH_URL, ITCH_BROWSER_TYPES
 from .config import Settings
 from .keys import get_owned_games
@@ -251,3 +251,19 @@ def get_jobs_for_url_or_path(path_or_url: str, settings: Settings) -> list[str]:
        return get_jobs_for_path(path_or_url)
    else:
        raise NotImplementedError(f"Cannot handle path or URL: {path_or_url}")
 def preprocess_job_urls(jobs: list[str], settings: Settings) -> list[str]:
    """Strip, filter and deduplicate job URLs, keeping first-seen order.

    Each entry of ``jobs`` is stripped of surrounding whitespace, then
    dropped if it fails the glob filter (``settings.filter_urls_glob``) or
    the regex filter (``settings.filter_urls_regex``). Filters that are
    unset never drop anything. Surviving URLs are deduplicated.

    Returns a new list; ``jobs`` itself is not modified.
    """
    # A dict serves as an insertion-ordered set: going through set() would
    # return the URLs in an arbitrary order that can change between runs.
    cleaned_jobs: dict[str, None] = {}
    for raw_job in jobs:
        job = raw_job.strip()
        if should_skip_item_by_glob("URL", job, settings.filter_urls_glob):
            continue
        if should_skip_item_by_regex("URL", job, settings.filter_urls_regex):
            continue
        cleaned_jobs[job] = None
    return list(cleaned_jobs)
--- a/itch_dl/utils.py
+++ b/itch_dl/utils.py
@@ -1,4 +1,8 @@
 import re
 import logging
 from fnmatch import fnmatch
 from typing import Literal
 class ItchDownloadError(Exception):
@@ -31,3 +35,19 @@ def get_int_after_marker_in_json(text: str, marker: str, key: str) -> int | None
        return None
    return int(found_ints[0])
 def should_skip_item_by_glob(kind: Literal['File'] | Literal['URL'], item: str, glob: str):
    if glob and not fnmatch(item, glob):
        logging.info("%s '%s' does not match the glob filter '%s', skipping", kind, item, glob)
        return True
    return False
 def should_skip_item_by_regex(kind: Literal['File'] | Literal['URL'], item: str, regex: str):
    if regex and not re.fullmatch(regex, item):
        logging.info("%s '%s' does not match the regex filter '%s', skipping", kind, item, regex)
        return True
    return False