Add filtering job URLs by regex/globs

Ryszard Knop 2025-04-03 18:50:33 +02:00
parent 5ab0dc0309
commit a092532192
4 changed files with 50 additions and 3 deletions

View File

@@ -54,6 +54,12 @@ def parse_args() -> argparse.Namespace:
                         help="filter downloaded files with a shell-style glob/fnmatch (unmatched files are skipped)")
     parser.add_argument("--filter-files-regex", metavar="regex", default=None,
                         help="filter downloaded files with a Python regex (unmatched files are skipped)")
+    parser.add_argument("--filter-urls-glob", metavar="glob", default=None,
+                        help="filter itch URLs with a shell-style glob/fnmatch (unmatched URLs are skipped)")
+    parser.add_argument("--filter-urls-regex", metavar="regex", default=None,
+                        help="filter itch URLs with a Python regex (unmatched URLs are skipped)")
     parser.add_argument("--verbose", action="store_true",
                         help="print verbose logs")
@@ -89,8 +95,10 @@ def run() -> int:
     )
     jobs = get_jobs_for_url_or_path(url_or_path, settings)
     jobs = list(set(jobs))  # Deduplicate, just in case...
-    logging.info("Found %d URL(s).", len(jobs))
+    logging.info("Found %d URL(s) total.", len(jobs))
+    jobs = preprocess_job_urls(jobs, settings)
+    logging.info("Will process %d URL(s) after filtering and deduplication.", len(jobs))
     if len(jobs) == 0:
         sys.exit("No URLs to download.")
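Not part of the commit: a minimal standalone sketch of how the two new flags surface on the parsed namespace (argparse maps --filter-urls-glob to args.filter_urls_glob). Only the two add_argument() calls mirror the diff above; the bare ArgumentParser and the sample command line are assumptions for illustration.

    import argparse

    parser = argparse.ArgumentParser()
    # These two calls mirror the new arguments added above.
    parser.add_argument("--filter-urls-glob", metavar="glob", default=None,
                        help="filter itch URLs with a shell-style glob/fnmatch (unmatched URLs are skipped)")
    parser.add_argument("--filter-urls-regex", metavar="regex", default=None,
                        help="filter itch URLs with a Python regex (unmatched URLs are skipped)")

    # Hypothetical command line, parsed in-process for illustration:
    args = parser.parse_args(["--filter-urls-glob", "https://*.itch.io/*"])
    print(args.filter_urls_glob)   # https://*.itch.io/*
    print(args.filter_urls_regex)  # None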

View File

@@ -27,6 +27,9 @@ class Settings:
     filter_files_glob: str | None = None
     filter_files_regex: str | None = None
+    filter_urls_glob: str | None = None
+    filter_urls_regex: str | None = None
     verbose: bool = False

View File

@@ -7,7 +7,7 @@ from http.client import responses
 from bs4 import BeautifulSoup

 from .api import ItchApiClient
-from .utils import ItchDownloadError, get_int_after_marker_in_json
+from .utils import ItchDownloadError, get_int_after_marker_in_json, should_skip_item_by_glob, should_skip_item_by_regex
 from .consts import ITCH_API, ITCH_BASE, ITCH_URL, ITCH_BROWSER_TYPES
 from .config import Settings
 from .keys import get_owned_games
@@ -251,3 +251,19 @@ def get_jobs_for_url_or_path(path_or_url: str, settings: Settings) -> list[str]:
         return get_jobs_for_path(path_or_url)
     else:
         raise NotImplementedError(f"Cannot handle path or URL: {path_or_url}")
+
+
+def preprocess_job_urls(jobs: list[str], settings: Settings) -> list[str]:
+    cleaned_jobs = set()
+    for job in jobs:
+        job = job.strip()
+
+        if should_skip_item_by_glob("URL", job, settings.filter_urls_glob):
+            continue
+
+        if should_skip_item_by_regex("URL", job, settings.filter_urls_regex):
+            continue
+
+        cleaned_jobs.add(job)
+
+    return list(cleaned_jobs)
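preprocess_job_urls() strips, filters, and deduplicates in one pass, which is why run() can log the post-filter count separately. Below is a standalone sketch of the same loop under assumed inputs: SimpleNamespace stands in for the real Settings object, only the glob filter is set, and the URLs are made up for illustration.

    from fnmatch import fnmatch
    from types import SimpleNamespace

    # Stand-in for Settings; only the two fields the filter loop reads are provided.
    settings = SimpleNamespace(filter_urls_glob="https://*.itch.io/*", filter_urls_regex=None)

    jobs = [
        "https://example-dev.itch.io/space-game ",  # trailing whitespace is stripped
        "https://example-dev.itch.io/space-game",   # duplicate after stripping
        "https://itch.io/jam/some-jam",             # no *.itch.io subdomain, the glob skips it
    ]

    cleaned_jobs = set()
    for job in jobs:
        job = job.strip()
        if settings.filter_urls_glob and not fnmatch(job, settings.filter_urls_glob):
            continue  # regex check omitted here since filter_urls_regex is None
        cleaned_jobs.add(job)

    print(sorted(cleaned_jobs))  # ['https://example-dev.itch.io/space-game']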

View File

@@ -1,4 +1,8 @@
 import re
+import logging
+
+from fnmatch import fnmatch
+from typing import Literal


 class ItchDownloadError(Exception):
@@ -31,3 +35,19 @@ def get_int_after_marker_in_json(text: str, marker: str, key: str) -> int | None:
         return None

     return int(found_ints[0])
+
+
+def should_skip_item_by_glob(kind: Literal['File'] | Literal['URL'], item: str, glob: str):
+    if glob and not fnmatch(item, glob):
+        logging.info("%s '%s' does not match the glob filter '%s', skipping", kind, item, glob)
+        return True
+
+    return False
+
+
+def should_skip_item_by_regex(kind: Literal['File'] | Literal['URL'], item: str, regex: str):
+    if regex and not re.fullmatch(regex, item):
+        logging.info("%s '%s' does not match the regex filter '%s', skipping", kind, item, regex)
+        return True
+
+    return False
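A quick illustration (not part of the commit) of how the two checks behave: fnmatch() compares the glob against the whole URL, and re.fullmatch() is anchored at both ends, so substring-style patterns need explicit wildcards or .* around them. The URL below is made up.

    import re
    from fnmatch import fnmatch

    url = "https://example-dev.itch.io/space-game"  # hypothetical URL

    # The glob must cover the entire URL, hence wildcards on both sides:
    print(fnmatch(url, "*.itch.io/*"))  # True
    print(fnmatch(url, "space-game"))   # False

    # re.fullmatch() likewise has to span the whole URL, not just find a substring:
    print(bool(re.fullmatch(r".*\.itch\.io/space-.*", url)))  # True
    print(bool(re.fullmatch(r"space-.*", url)))               # False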