Add filtering job URLs by regex/globs

Ryszard Knop 2025-04-03 18:50:33 +02:00
parent 5ab0dc0309
commit a092532192
4 changed files with 50 additions and 3 deletions

View File

@@ -54,6 +54,12 @@ def parse_args() -> argparse.Namespace:
                         help="filter downloaded files with a shell-style glob/fnmatch (unmatched files are skipped)")
     parser.add_argument("--filter-files-regex", metavar="regex", default=None,
                         help="filter downloaded files with a Python regex (unmatched files are skipped)")
+    parser.add_argument("--filter-urls-glob", metavar="glob", default=None,
+                        help="filter itch URLs with a shell-style glob/fnmatch (unmatched URLs are skipped)")
+    parser.add_argument("--filter-urls-regex", metavar="regex", default=None,
+                        help="filter itch URLs with a Python regex (unmatched URLs are skipped)")
     parser.add_argument("--verbose", action="store_true",
                         help="print verbose logs")
@@ -89,8 +95,10 @@ def run() -> int:
     )
     jobs = get_jobs_for_url_or_path(url_or_path, settings)
     jobs = list(set(jobs))  # Deduplicate, just in case...
-    logging.info("Found %d URL(s).", len(jobs))
+    logging.info("Found %d URL(s) total.", len(jobs))
+    jobs = preprocess_job_urls(jobs, settings)
+    logging.info("Will process %d URL(s) after filtering and deduplication.", len(jobs))
     if len(jobs) == 0:
         sys.exit("No URLs to download.")
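Not part of the commit: a minimal standalone sketch of how the two new flags surface on the parsed namespace (argparse maps --filter-urls-glob to args.filter_urls_glob). Only the two add_argument() calls mirror the diff above; the bare ArgumentParser and the sample command line are assumptions for illustration.

    import argparse

    parser = argparse.ArgumentParser()
    # These two calls mirror the new arguments added above.
    parser.add_argument("--filter-urls-glob", metavar="glob", default=None,
                        help="filter itch URLs with a shell-style glob/fnmatch (unmatched URLs are skipped)")
    parser.add_argument("--filter-urls-regex", metavar="regex", default=None,
                        help="filter itch URLs with a Python regex (unmatched URLs are skipped)")

    # Hypothetical command line, parsed in-process for illustration:
    args = parser.parse_args(["--filter-urls-glob", "https://*.itch.io/*"])
    print(args.filter_urls_glob)   # https://*.itch.io/*
    print(args.filter_urls_regex)  # None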

View File

@@ -27,6 +27,9 @@ class Settings:
     filter_files_glob: str | None = None
     filter_files_regex: str | None = None
+    filter_urls_glob: str | None = None
+    filter_urls_regex: str | None = None
     verbose: bool = False

View File

@@ -7,7 +7,7 @@ from http.client import responses
 from bs4 import BeautifulSoup

 from .api import ItchApiClient
-from .utils import ItchDownloadError, get_int_after_marker_in_json
+from .utils import ItchDownloadError, get_int_after_marker_in_json, should_skip_item_by_glob, should_skip_item_by_regex
 from .consts import ITCH_API, ITCH_BASE, ITCH_URL, ITCH_BROWSER_TYPES
 from .config import Settings
 from .keys import get_owned_games
@@ -251,3 +251,19 @@ def get_jobs_for_url_or_path(path_or_url: str, settings: Settings) -> list[str]:
         return get_jobs_for_path(path_or_url)
     else:
         raise NotImplementedError(f"Cannot handle path or URL: {path_or_url}")
+
+
+def preprocess_job_urls(jobs: list[str], settings: Settings) -> list[str]:
+    cleaned_jobs = set()
+    for job in jobs:
+        job = job.strip()
+
+        if should_skip_item_by_glob("URL", job, settings.filter_urls_glob):
+            continue
+
+        if should_skip_item_by_regex("URL", job, settings.filter_urls_regex):
+            continue
+
+        cleaned_jobs.add(job)
+
+    return list(cleaned_jobs)
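preprocess_job_urls() strips, filters, and deduplicates in one pass, which is why run() can log the post-filter count separately. Below is a standalone sketch of the same loop under assumed inputs: SimpleNamespace stands in for the real Settings object, only the glob filter is set, and the URLs are made up for illustration.

    from fnmatch import fnmatch
    from types import SimpleNamespace

    # Stand-in for Settings; only the two fields the filter loop reads are provided.
    settings = SimpleNamespace(filter_urls_glob="https://*.itch.io/*", filter_urls_regex=None)

    jobs = [
        "https://example-dev.itch.io/space-game ",  # trailing whitespace is stripped
        "https://example-dev.itch.io/space-game",   # duplicate after stripping
        "https://itch.io/jam/some-jam",             # no *.itch.io subdomain, the glob skips it
    ]

    cleaned_jobs = set()
    for job in jobs:
        job = job.strip()
        if settings.filter_urls_glob and not fnmatch(job, settings.filter_urls_glob):
            continue  # regex check omitted here since filter_urls_regex is None
        cleaned_jobs.add(job)

    print(sorted(cleaned_jobs))  # ['https://example-dev.itch.io/space-game']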

View File

@@ -1,4 +1,8 @@
 import re
+import logging
+
+from fnmatch import fnmatch
+from typing import Literal


 class ItchDownloadError(Exception):
@@ -31,3 +35,19 @@ def get_int_after_marker_in_json(text: str, marker: str, key: str) -> int | None:
         return None

     return int(found_ints[0])
+
+
+def should_skip_item_by_glob(kind: Literal['File'] | Literal['URL'], item: str, glob: str):
+    if glob and not fnmatch(item, glob):
+        logging.info("%s '%s' does not match the glob filter '%s', skipping", kind, item, glob)
+        return True
+
+    return False
+
+
+def should_skip_item_by_regex(kind: Literal['File'] | Literal['URL'], item: str, regex: str):
+    if regex and not re.fullmatch(regex, item):
+        logging.info("%s '%s' does not match the regex filter '%s', skipping", kind, item, regex)
+        return True
+
+    return False
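A quick illustration (not part of the commit) of how the two checks behave: fnmatch() compares the glob against the whole URL, and re.fullmatch() is anchored at both ends, so substring-style patterns need explicit wildcards or .* around them. The URL below is made up.

    import re
    from fnmatch import fnmatch

    url = "https://example-dev.itch.io/space-game"  # hypothetical URL

    # The glob must cover the entire URL, hence wildcards on both sides:
    print(fnmatch(url, "*.itch.io/*"))  # True
    print(fnmatch(url, "space-game"))   # False

    # re.fullmatch() likewise has to span the whole URL, not just find a substring:
    print(bool(re.fullmatch(r".*\.itch\.io/space-.*", url)))  # True
    print(bool(re.fullmatch(r"space-.*", url)))               # False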