import os
import json
import logging

import parsel
import requests

logger = logging.getLogger(__name__)


def _clean_name(dirty_str):
    """Strip characters that are unsafe in file and folder names."""
    allowed_chars = (' ', '_', '.', '-', '[', ']')
    clean = []
    for c in dirty_str.replace('+', '_').replace(':', ' -'):
        if c.isalpha() or c.isdigit() or c in allowed_chars:
            clean.append(c)
    return "".join(clean).strip()


def download_library(cookie_path, library_path, progress_bar=False,
                     ext_include=None, ext_exclude=None,
                     platform_include=None, purchase_keys=None):
    """Download all items in a Humble Bundle library.

    Files are saved under ``library_path/<bundle>/<item>/`` and a
    ``.cache.json`` file in ``library_path`` tracks what has already
    been downloaded so finished files are not fetched again.
    """
    if ext_include is None:
        ext_include = []
    ext_include = list(map(str.lower, ext_include))

    if ext_exclude is None:
        ext_exclude = []
    ext_exclude = list(map(str.lower, ext_exclude))

    if platform_include is None or 'all' in platform_include:
        # If 'all' is requested there is no need to filter by platform
        platform_include = []
    platform_include = list(map(str.lower, platform_include))

    # Load the session cookies copied from a logged-in browser
    with open(cookie_path, 'r') as f:
        account_cookies = f.read()

    cache_file = os.path.join(library_path, '.cache.json')
    try:
        with open(cache_file, 'r') as f:
            cache_data = json.load(f)
    except FileNotFoundError:
        cache_data = {}

    if not purchase_keys:
        # Scrape the library page for the embedded JSON blob that lists
        # every purchase key ("gamekey") on the account
        library_r = requests.get('https://www.humblebundle.com/home/library',
                                 headers={'cookie': account_cookies})
        logger.debug("Library request: " + str(library_r))
        library_page = parsel.Selector(text=library_r.text)
        orders_json = json.loads(library_page.css('#user-home-json-data')
                                 .xpath('string()').extract_first())
        purchase_keys = orders_json['gamekeys']

    for order_id in purchase_keys:
        order_url = ('https://www.humblebundle.com/api/v1/order/{order_id}'
                     '?all_tpkds=true'.format(order_id=order_id))
        order_r = requests.get(order_url,
                               headers={'cookie': account_cookies})
        logger.debug("Order request: " + str(order_r))
        order = order_r.json()
        bundle_title = _clean_name(order['product']['human_name'])
        logger.info("Checking bundle: " + str(bundle_title))
        for item in order['subproducts']:
            item_title = _clean_name(item['human_name'])
            # Get all types of download for a product
            for download_type in item['downloads']:
                # Type of product: ebook, video, etc.
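                # The shape below is inferred from the field accesses in
                # this loop, not from an official schema; each entry looks
                # roughly like:
                #     {"platform": "ebook",
                #      "download_struct": [{"url": {"web": "..."},
                #                           "md5": "...", "sha1": "..."}]}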
                platform = download_type['platform'].lower()
                if platform_include and platform not in platform_include:
                    logger.info("Do not want " + platform
                                + ", skipping " + item_title)
                    continue

                item_folder = os.path.join(
                    library_path, bundle_title, item_title
                )

                # Create the directory to save the files to
                try:
                    os.makedirs(item_folder)
                except OSError:
                    pass  # Folder already exists

                # Download each file type of a product
                for file_type in download_type['download_struct']:
                    try:
                        url = file_type['url']['web']
                    except KeyError:
                        logger.info("No url found for " + bundle_title
                                    + "/" + item_title)
                        continue

                    url_filename = url.split('?')[0].split('/')[-1]
                    cache_file_key = order_id + ':' + url_filename
                    ext = url_filename.split('.')[-1]
                    # Only get the file types we care about
                    if ((ext_include and ext.lower() not in ext_include)
                            or (ext_exclude and ext.lower() in ext_exclude)):
                        logger.info("Skipping the file " + url_filename)
                        continue

                    filename = os.path.join(item_folder, url_filename)
                    item_r = requests.get(url, stream=True)
                    logger.debug("Item request: {item_r}, Url: {url}"
                                 .format(item_r=item_r, url=url))
                    # Not sure which value is best for detecting changes,
                    # so store them all
                    file_info = {
                        'md5': file_type.get('md5'),
                        'sha1': file_type.get('sha1'),
                        'url_last_modified': item_r.headers['Last-Modified'],
                        'url_etag': item_r.headers['ETag'][1:-1],
                        'url_crc': item_r.headers['X-HW-Cache-CRC'],
                    }
                    if file_info != cache_data.get(cache_file_key, {}):
                        if not progress_bar:
                            logger.info(
                                "Downloading: {item_title}/{url_filename}"
                                .format(item_title=item_title,
                                        url_filename=url_filename))

                        with open(filename, 'wb') as outfile:
                            total_length = item_r.headers.get('content-length')
                            if total_length is None:
                                # No Content-Length header, so the size is
                                # unknown; write the body in one go
                                outfile.write(item_r.content)
                            else:
                                dl = 0
                                total_length = int(total_length)
                                pb_width = 50
                                for data in item_r.iter_content(chunk_size=4096):  # noqa: E501
                                    dl += len(data)
                                    outfile.write(data)
                                    done = int(pb_width * dl / total_length)
                                    if progress_bar:
                                        print("Downloading: {item_title}/{url_filename}: {percent}% [{filler}{space}]"  # noqa: E501
                                              .format(item_title=item_title,
                                                      url_filename=url_filename,  # noqa: E501
                                                      percent=int(done * (100 / pb_width)),  # noqa: E501
                                                      filler='=' * done,
                                                      space=' ' * (pb_width - done)),  # noqa: E501
                                              end='\r')

                        if progress_bar:
                            # Print a newline so the next progress bar
                            # starts on its own line
                            print()

                        cache_data[cache_file_key] = file_info
                        # Update the cache file after every download so the
                        # script can pick up where it left off if it quits
                        with open(cache_file, 'w') as outfile:
                            json.dump(
                                cache_data, outfile,
                                sort_keys=True, indent=4,
                            )
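

# Example usage: a minimal sketch, not part of the library itself. The
# paths are placeholders; 'cookies.txt' is assumed to contain the raw
# Cookie header string copied from a logged-in humblebundle.com browser
# session, since that is what download_library sends verbatim.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    download_library(
        cookie_path='cookies.txt',    # placeholder path
        library_path='my_library',    # placeholder path
        progress_bar=True,
        ext_include=['pdf', 'epub'],  # e.g. only grab ebook files
        platform_include=['ebook'],
    )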