thingy_grabber.py
#!/usr/bin/env python3
"""
Thingiverse bulk downloader
"""
import re
import sys
import os
import argparse
import unicodedata
import requests
import logging
import multiprocessing
import enum
import datetime
from shutil import copyfile
from dataclasses import dataclass
import py7zr
import glob
import shutil
from io import StringIO
from html.parser import HTMLParser

# Compression filter chain used when archiving a download as a .7z file.
SEVENZIP_FILTERS = [{'id': py7zr.FILTER_LZMA2}]

# I don't think this is exported by datetime
DEFAULT_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'
# Windows cannot handle : in filenames
SAFE_DATETIME_FORMAT = '%Y-%m-%d %H.%M.%S'

# Thingiverse REST API endpoint templates; the trailing "{}" fragments
# are filled in with str.format() (ids, then the access token).
API_BASE = "https://api.thingiverse.com"
ACCESS_QP = "access_token={}"
PAGE_QP = "page={}"
API_USER_DESIGNS = API_BASE + "/users/{}/things/?" + ACCESS_QP
API_USER_COLLECTIONS = API_BASE + "/users/{}/collections/all?" + ACCESS_QP
# Currently useless as it gives the same info as the matching element in API_USER_COLLECTIONS
API_COLLECTION = API_BASE + "/collections/{}/?" + ACCESS_QP
API_COLLECTION_THINGS = API_BASE + "/collections/{}/things/?" + ACCESS_QP
API_THING_DETAILS = API_BASE + "/things/{}/?" + ACCESS_QP
API_THING_FILES = API_BASE + "/things/{}/files/?" + ACCESS_QP
API_THING_IMAGES = API_BASE + "/things/{}/images/?" + ACCESS_QP
# Suffix appended to a file URL to obtain its download link.
API_THING_DOWNLOAD = "/download/?" + ACCESS_QP

# Number of parallel downloader worker processes to spawn.
DOWNLOADER_COUNT = 1
# NOTE(review): RETRY_COUNT is defined but not referenced in this file.
RETRY_COUNT = 3
# Longest absolute path we are willing to create (Windows limitation).
MAX_PATH_LENGTH = 250

VERSION = "0.10.5"
# Name of the per-thing marker file recording the download timestamp.
TIMESTAMP_FILE = "timestamp.txt"

# Shared HTTP session so connections are reused across requests.
SESSION = requests.Session()
  45. class MLStripper(HTMLParser):
  46. """ Turns HTML markup into plain text
  47. """
  48. def error(self, message):
  49. raise ValueError(message)
  50. def __init__(self):
  51. super().__init__()
  52. self.reset()
  53. self.strict = False
  54. self.convert_charrefs = True
  55. self.text = StringIO()
  56. def handle_data(self, d):
  57. self.text.write(d)
  58. def get_data(self):
  59. return self.text.getvalue()
  60. @staticmethod
  61. def strip_tags(html):
  62. s = MLStripper()
  63. s.feed(html)
  64. return s.get_data()
@dataclass
class ThingLink:
    """ Lightweight reference to a thing: its id, display name and API URL. """
    thing_id: str
    name: str
    api_link: str
@dataclass
class FileLink:
    """ A single downloadable file attached to a thing. """
    name: str
    # Server-side modification time, parsed with DEFAULT_DATETIME_FORMAT.
    last_update: datetime.datetime
    # Download URL (the API file URL with the download suffix appended).
    link: str
@dataclass
class ImageLink:
    """ A single image attached to a thing. """
    name: str
    link: str
  79. class FileLinks:
  80. def __init__(self, initial_links=None):
  81. if initial_links is None:
  82. initial_links = []
  83. self.links = []
  84. self.last_update = None
  85. for link in initial_links:
  86. self.append(link)
  87. def __iter__(self):
  88. return iter(self.links)
  89. def __getitem__(self, item):
  90. return self.links[item]
  91. def __len__(self):
  92. return len(self.links)
  93. def append(self, link):
  94. try:
  95. self.last_update = max(self.last_update, link.last_update)
  96. except TypeError:
  97. self.last_update = link.last_update
  98. self.links.append(link)
class State(enum.Enum):
    """ Outcome of an attempted thing download. """
    OK = enum.auto()
    FAILED = enum.auto()
    ALREADY_DOWNLOADED = enum.auto()
  103. def sanitise_url(url):
  104. """ remove api keys from an url
  105. """
  106. return re.sub(r'access_token=\w*',
  107. 'access_token=***',
  108. url)
  109. def strip_time(date_obj):
  110. """ Takes a datetime object and returns another with the time set to 00:00
  111. """
  112. return datetime.datetime.combine(date_obj.date(), datetime.time())
def rename_unique(dir_name, target_dir_name):
    """ Move a directory sideways to a new name, ensuring it is unique.

    If target_dir_name already exists, suffixes _0, _1, ... are tried
    until a free name is found. Returns the path actually used.
    """
    target_dir = target_dir_name
    inc = 0
    while os.path.exists(target_dir):
        target_dir = "{}_{}".format(target_dir_name, inc)
        inc += 1
    os.rename(dir_name, target_dir)
    return target_dir
  123. def fail_dir(dir_name):
  124. """ When a download has failed, move it sideways.
  125. """
  126. return rename_unique(dir_name, "{}_failed".format(dir_name))
  127. def truncate_name(file_name):
  128. """ Ensure the filename is not too long for, well windows basically.
  129. """
  130. path = os.path.abspath(file_name)
  131. if len(path) <= MAX_PATH_LENGTH:
  132. return path
  133. base, extension = os.path.splitext(path)
  134. inc = 0
  135. new_path = "{}_{}{}".format(base, inc, extension)
  136. while os.path.exists(new_path):
  137. new_path = "{}_{}{}".format(base, inc, extension)
  138. inc += 1
  139. return new_path
  140. def slugify(value):
  141. """
  142. Normalise string, removes invalid for filename charactersr
  143. and converts string to lowercase.
  144. """
  145. logging.debug("Sluggyfying {}".format(value))
  146. value = unicodedata.normalize('NFKC', value).lower().strip()
  147. value = re.sub(r'[\\/<>:?*|"]', '', value)
  148. value = re.sub(r'\.*$', '', value)
  149. return value.strip()
class Downloader(multiprocessing.Process):
    """
    Class to handle downloading the things we have found to get.

    Consumes entries from thing_queue until a None sentinel arrives.
    Entries may be thing ids (str) or ThingLink objects; each is turned
    into a Thing and downloaded into download_directory.
    """

    def __init__(self, thing_queue, download_directory, compress, api_key):
        multiprocessing.Process.__init__(self)
        # TODO: add parameters
        self.thing_queue = thing_queue
        self.download_directory = download_directory
        self.compress = compress
        self.api_key = api_key

    def run(self):
        """ actual download loop.
        """
        while True:
            thing_id = self.thing_queue.get()
            if thing_id is None:
                # None is the shutdown sentinel pushed by main().
                logging.info("Shutting download queue")
                self.thing_queue.task_done()
                break
            thing = None
            if isinstance(thing_id, str):
                thing = Thing.from_thing_id(thing_id)
            if isinstance(thing_id, ThingLink):
                thing = Thing(thing_id)
            if not thing:
                logging.error("Don't know how to handle thing_id {}".format(thing_id))
            else:
                logging.info("Handling id {}".format(thing_id))
                thing.download(self.download_directory, self.compress, self.api_key)
            # Mark the queue entry handled so a join() can complete.
            self.thing_queue.task_done()
        return
  182. class Grouping:
  183. """ Holds details of a group of things for download
  184. This is effectively (although not actually) an abstract class
  185. - use Collection or Designs instead.
  186. """
  187. def __init__(self, quick, compress, api_key):
  188. self.things = []
  189. self.total = 0
  190. self.req_id = None
  191. self.last_page = 0
  192. self.per_page = None
  193. # Should we stop downloading when we hit a known datestamp?
  194. self.quick = quick
  195. self.compress = compress
  196. self.api_key = api_key
  197. # These should be set by child classes.
  198. self.url = None
  199. self.download_dir = None
  200. @property
  201. def get(self):
  202. """ retrieve the things of the grouping. """
  203. if self.things:
  204. # We've already done it.
  205. return self.things
  206. # Check for initialisation:
  207. if not self.url:
  208. logging.error("No URL set - object not initialised properly?")
  209. raise ValueError("No URL set - object not initialised properly?")
  210. # Get the internal details of the grouping.
  211. logging.debug("Querying {}".format(sanitise_url(self.url)))
  212. # follow next links until all items are found
  213. current_url = self.url
  214. while current_url != None:
  215. logging.info("requesting:{}".format(sanitise_url(current_url)))
  216. current_req = SESSION.get(current_url)
  217. current_url = current_req.links.get('next', {}).get('url')
  218. if current_req.status_code != 200:
  219. logging.error(
  220. "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(self.url),
  221. current_req.text))
  222. else:
  223. current_json = current_req.json()
  224. for thing in current_json:
  225. logging.debug(thing)
  226. self.things.append(ThingLink(thing['id'], thing['name'], thing['url']))
  227. logging.info("Found {} things.".format(len(self.things)))
  228. return self.things
  229. def download(self):
  230. """ Downloads all the files in a collection """
  231. if not self.things:
  232. self.get
  233. if not self.download_dir:
  234. raise ValueError(
  235. "No download_dir set - invalidly initialised object?")
  236. try:
  237. os.mkdir(self.download_dir)
  238. except FileExistsError:
  239. logging.info("Target directory {} already exists. Assuming a resume."
  240. .format(self.download_dir))
  241. logging.info("Downloading {} thing(s).".format(self.total))
  242. for idx, thing in enumerate(self.things):
  243. logging.info("Downloading thing {} - {}".format(idx, thing))
  244. return_code = Thing(thing).download(self.download_dir, self.compress, self.api_key)
  245. if self.quick and return_code == State.ALREADY_DOWNLOADED:
  246. logging.info("Caught up, stopping.")
  247. return
class Collection(Grouping):
    """ Holds details of a collection.

    The constructor performs network I/O to resolve the collection
    name to its numeric id. On any failure it logs and returns early,
    leaving self.url unset so a later get/download raises ValueError.
    """

    def __init__(self, user, name, directory, quick, compress, api_key):
        Grouping.__init__(self, quick, compress, api_key)
        self.user = user
        self.name = name
        # Legacy flag; not consulted anywhere in this file.
        self.paginated = False
        # need to figure out the ID for the collection
        collection_url = API_USER_COLLECTIONS.format(user, api_key)
        try:
            current_req = SESSION.get(collection_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for collections for user {}: {}".format(
                self.user, error))
            return
        if current_req.status_code != 200:
            logging.error(
                "Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(collection_url),
                                                                current_req.text))
            return
        collection_list = current_req.json()
        try:
            # case insensitive to retain parity with previous behaviour
            collection = [x for x in collection_list if x['name'].casefold() == name.casefold()][0]
        except IndexError:
            logging.error("Unable to find collection {} for user {}".format(name, user))
            return
        self.collection_id = collection['id']
        self.url = API_COLLECTION_THINGS.format(self.collection_id, api_key)
        self.download_dir = os.path.join(directory,
                                         "{}-{}".format(slugify(self.user), slugify(self.name)))
  279. class Designs(Grouping):
  280. """ Holds details of all of a users' designs. """
  281. def __init__(self, user, directory, quick, compress, api_key):
  282. Grouping.__init__(self, quick, compress, api_key)
  283. self.user = user
  284. self.url = API_USER_DESIGNS.format(user, api_key)
  285. self.download_dir = os.path.join(
  286. directory, "{} designs".format(slugify(self.user)))
class Thing:
    """ An individual design on thingiverse. """

    def __init__(self, thing_link):
        # thing_link: a ThingLink carrying id, display name and API URL.
        self.thing_id = thing_link.thing_id
        self.name = thing_link.name
        # Timestamp of the most recent previous download, if any.
        self.last_time = None
        self._parsed = False
        self._needs_download = True
        self.text = None
        self.download_dir = None
        # Newest file timestamp of this download; written to the
        # TIMESTAMP_FILE marker on success.
        self.time_stamp = None
        self._file_links = FileLinks()
        self._image_links = []

    @classmethod
    def from_thing_id(cls, thing_id):
        """
        Factory method that looks up a thing by ID and creates a Thing object for it
        :param thing_id: to look up
        :return: Thing or None
        """
        return Thing(ThingLink(thing_id, "", ""))

    def _parse(self, base_dir, api_key):
        """ Work out what, if anything needs to be done.

        Fetches thing details, the file list and the image list from the
        API, then compares the newest file timestamp against any earlier
        download found under base_dir. Leaves self._parsed False on any
        network/HTTP error so download() can abort.
        """
        if self._parsed:
            return
        # First get the broad details
        url = API_THING_DETAILS.format(self.thing_id, api_key)
        try:
            current_req = SESSION.get(url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        # Check for DMCA
        if current_req.status_code == 403:
            logging.error("Access to thing {} is forbidden".format(self.thing_id))
            return
        if current_req.status_code != 200:
            logging.error("Got unexpected code {} from url {}: {}".format(current_req.status_code, sanitise_url(url),
                                                                          current_req.text))
            return
        thing_json = current_req.json()
        try:
            self._license = thing_json['license']
        except KeyError:
            # NOTE(review): on KeyError self._license is never assigned;
            # download() later reads it inside a try that only catches
            # IOError, so an AttributeError could propagate - confirm.
            logging.warning("No license found for thing {}?".format(self.thing_id))
        details = None
        try:
            details = thing_json['details']
        except KeyError:
            logging.warning("No description found for thing {}?".format(self.thing_id))
        if details:
            # Keep a plain-text rendering of the HTML description; fall
            # back to the raw HTML if stripping fails.
            try:
                self._details = MLStripper.strip_tags(details)
            except ValueError as e:
                logging.warning("Unable to strip HTML from readme: {}".format(e))
                self._details = details
        if not self.name:
            # Probably generated with factory method.
            try:
                self.name = thing_json['name']
            except KeyError:
                logging.warning("No name found for thing {}?".format(self.thing_id))
                self.name = self.thing_id
        # Now get the file details
        file_url = API_THING_FILES.format(self.thing_id, api_key)
        try:
            current_req = SESSION.get(file_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        if current_req.status_code != 200:
            logging.error("Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(file_url),
                                                                        current_req.text))
            return
        link_list = current_req.json()
        if not link_list:
            logging.error("No files found for thing {} - probably thingiverse being broken, try again later".format(
                self.thing_id))
        for link in link_list:
            logging.debug("Parsing link: {}".format(sanitise_url(link['url'])))
            try:
                datestamp = datetime.datetime.strptime(link['date'], DEFAULT_DATETIME_FORMAT)
                self._file_links.append(
                    FileLink(link['name'], datestamp, link['url'] + API_THING_DOWNLOAD.format(api_key)))
            except ValueError:
                # Unparseable date: log it and skip this file.
                logging.error(link['date'])
        # Finally get the image links
        image_url = API_THING_IMAGES.format(self.thing_id, api_key)
        try:
            current_req = SESSION.get(image_url)
        except requests.exceptions.ConnectionError as error:
            logging.error("Unable to connect for thing {}: {}".format(
                self.thing_id, error))
            return
        if current_req.status_code != 200:
            logging.error(
                "Unexpected status code {} for {}: {}".format(current_req.status_code, sanitise_url(image_url),
                                                              current_req.text))
            return
        image_list = current_req.json()
        if not image_list:
            logging.warning(
                "No images found for thing {} - probably thingiverse being iffy as this seems unlikely".format(
                    self.thing_id))
        for image in image_list:
            logging.debug("parsing image: {}".format(image))
            name = None
            try:
                name = slugify(image['name'])
                # TODO: fallback to other types
                url = [x for x in image['sizes'] if x['type'] == 'display' and x['size'] == 'large'][0]['url']
            except KeyError:
                logging.warning("Missing image for {}".format(name))
            # NOTE(review): if the very first image raises KeyError
            # above, 'url' is unbound here (NameError); later failures
            # silently reuse the previous image's url. The append looks
            # like it belongs inside the try - confirm intent.
            self._image_links.append(ImageLink(name, url))
        self.slug = "{} - {}".format(self.thing_id, slugify(self.name))
        self.download_dir = os.path.join(base_dir, self.slug)
        self._handle_old_directory(base_dir)
        logging.debug("Parsing {} ({})".format(self.thing_id, self.name))
        latest, self.last_time = self._find_last_download(base_dir)
        if not latest:
            # Not yet downloaded
            self._parsed = True
            return
        logging.info("last downloaded version: {}".format(self.last_time))
        # OK, so we have a timestamp, lets see if there is anything new to get
        # First off, are we comparing an old download that threw away the timestamp?
        ignore_time = self.last_time == strip_time(self.last_time)
        try:
            # TODO: Allow for comparison at the exact time
            files_last_update = self._file_links.last_update
            if ignore_time:
                logging.info("Dropping time from comparison stamp as old-style download dir")
                files_last_update = strip_time(files_last_update)
            if files_last_update > self.last_time:
                logging.info(
                    "Found new/updated files {}".format(self._file_links.last_update))
                self._needs_download = True
                self._parsed = True
                return
        except TypeError:
            # last_update is None: no file links were parsed.
            logging.warning("No files found for {}.".format(self.thing_id))
        # Got here, so nope, no new files.
        self._needs_download = False
        self._parsed = True

    def _handle_old_directory(self, base_dir):
        """ Deal with any old directories from previous versions of the code.
        """
        old_dir = os.path.join(base_dir, slugify(self.name))
        if os.path.exists(old_dir):
            logging.warning("Found old style download_dir. Moving.")
            rename_unique(old_dir, self.download_dir)

    def _handle_outdated_directory(self):
        """ Move the current download directory sideways if the thing has changed.

        Returns the renamed path, or None if there was nothing to move.
        """
        if not os.path.exists(self.download_dir):
            # No old directory to move.
            return None
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
        if not os.path.exists(timestamp_file):
            # Old form of download directory
            target_dir_name = "{} - old".format(self.download_dir)
        else:
            target_dir_name = "{} - {}".format(self.download_dir, self.last_time.strftime(SAFE_DATETIME_FORMAT))
        return rename_unique(self.download_dir, target_dir_name)

    def _find_last_download(self, base_dir):
        """ Look for the most recent previous download (if any) of the thing.

        Checks the normal download dir's timestamp file first, then any
        "<thing_id>*.7z" archives in base_dir. Returns a (path, time)
        pair, either element possibly None.
        """
        logging.info("Looking for old things")
        # First the DL directory itself.
        timestamp_file = os.path.join(self.download_dir, TIMESTAMP_FILE)
        latest = None
        latest_time = None
        try:
            logging.debug("Checking for existing download in normal place.")
            with open(timestamp_file) as ts_fh:
                timestamp_text = ts_fh.read().strip()
            latest_time = datetime.datetime.strptime(timestamp_text, DEFAULT_DATETIME_FORMAT)
            latest = self.download_dir
        except FileNotFoundError:
            # No existing download directory. huh.
            pass
        except TypeError:
            logging.warning("Invalid timestamp file found in {}".format(self.download_dir))
        # TODO: Maybe look for old download directories.
        # Now look for 7z files
        candidates = glob.glob(os.path.join(base_dir, "{}*.7z".format(self.thing_id)))
        # +3 to allow for ' - '
        leading_length = len(self.slug) + 3
        for path in candidates:
            candidate = os.path.basename(path)
            try:
                # Slice off the slug prefix and the ".7z" suffix to get
                # the embedded timestamp.
                logging.debug("Examining '{}' - '{}'".format(candidate, candidate[leading_length:-3]))
                candidate_time = datetime.datetime.strptime(candidate[leading_length:-3], SAFE_DATETIME_FORMAT)
            except ValueError:
                logging.warning("There was an error finding the date in {}. Ignoring.".format(candidate))
                continue
            try:
                if candidate_time > latest_time:
                    latest_time = candidate_time
                    latest = candidate
            except TypeError:
                # latest_time still None: first parseable candidate wins.
                latest_time = candidate_time
                latest = candidate
        logging.info("Found last old thing: {} / {}".format(latest, latest_time))
        return latest, latest_time

    def download(self, base_dir, compress, api_key):
        """ Download all files for a given thing.

        Returns a State member: OK once the thing is downloaded (or was
        compressed), ALREADY_DOWNLOADED when nothing new exists, FAILED
        on any error. (An earlier comment claimed a boolean return; the
        code returns State values throughout.)
        """
        if not self._parsed:
            self._parse(base_dir, api_key)
        # _parse leaves _parsed False if any API call failed.
        if not self._parsed:
            logging.error(
                "Unable to parse {} - aborting download".format(self.thing_id))
            return State.FAILED
        if not self._needs_download:
            logging.info("{} - {} already downloaded - skipping.".format(self.thing_id, self.name))
            return State.ALREADY_DOWNLOADED
        if not self._file_links:
            logging.error(
                "{} - {} appears to have no files. Thingiverse acting up again?".format(self.thing_id, self.name))
            return State.FAILED
        # Have we already downloaded some things?
        renamed_dir = self._handle_outdated_directory()
        # Get the list of files to download
        new_file_links = []
        old_file_links = []
        self.time_stamp = None
        if not self.last_time:
            # If we don't have anything to copy from, then it is all new.
            logging.debug("No last time, downloading all files")
            new_file_links = self._file_links
            self.time_stamp = new_file_links[0].last_update
            for file_link in new_file_links:
                self.time_stamp = max(self.time_stamp, file_link.last_update)
            logging.debug("New timestamp will be {}".format(self.time_stamp))
        else:
            # Split links into unchanged (copy from old dir) vs updated
            # (re-download), tracking the newest timestamp seen.
            self.time_stamp = self.last_time
            for file_link in self._file_links:
                if file_link.last_update > self.last_time:
                    new_file_links.append(file_link)
                    self.time_stamp = max(self.time_stamp, file_link.last_update)
                else:
                    old_file_links.append(file_link)
            logging.debug("new timestamp {}".format(self.time_stamp))
        # OK. Time to get to work.
        logging.debug("Generating download_dir")
        os.mkdir(self.download_dir)
        filelist_file = os.path.join(self.download_dir, "filelist.txt")
        # Record the full manifest for debugging/resume purposes.
        with open(filelist_file, 'w', encoding="utf-8") as fl_handle:
            for fl in self._file_links:
                fl_handle.write("{},{},{}\n".format(fl.link, fl.name, fl.last_update))
        # First grab the cached files (if any)
        logging.info("Copying {} unchanged files.".format(len(old_file_links)))
        if renamed_dir:
            for file_link in old_file_links:
                try:
                    old_file = os.path.join(renamed_dir, file_link.name)
                    new_file = truncate_name(os.path.join(self.download_dir, file_link.name))
                    logging.debug("Copying {} to {}".format(old_file, new_file))
                    copyfile(old_file, new_file)
                except FileNotFoundError:
                    logging.warning(
                        "Unable to find {} in old archive, redownloading".format(file_link.name))
                    new_file_links.append(file_link)
                except TypeError:
                    # Not altogether sure how this could occur, possibly with some combination of the old file types
                    logging.warning(
                        "Typeerror looking for {} in {}".format(file_link.name, renamed_dir))
                    new_file_links.append(file_link)
        # Now download the new ones
        logging.info("Downloading {} new files of {}".format(
            len(new_file_links), len(self._file_links)))
        try:
            for file_link in new_file_links:
                file_name = truncate_name(os.path.join(self.download_dir, file_link.name))
                logging.debug("Downloading {} from {} to {}".format(
                    file_link.name, file_link.link, file_name))
                data_req = SESSION.get(file_link.link)
                if data_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}".format(data_req.status_code,
                                                                            sanitise_url(file_link.link)))
                    logging.debug("Unexpected status code {} for {}: {}".format(data_req.status_code,
                                                                                sanitise_url(file_link.link),
                                                                                data_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED
                with open(file_name, 'wb') as handle:
                    handle.write(data_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(file_link.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED
        # People like images.
        image_dir = os.path.join(self.download_dir, 'images')
        logging.info("Downloading {} images.".format(len(self._image_links)))
        try:
            os.mkdir(image_dir)
            for imagelink in self._image_links:
                filename = os.path.join(image_dir, imagelink.name)
                image_req = SESSION.get(imagelink.link)
                if image_req.status_code != 200:
                    logging.error("Unexpected status code {} for {}: {}".format(image_req.status_code,
                                                                                sanitise_url(imagelink.link),
                                                                                image_req.text))
                    fail_dir(self.download_dir)
                    return State.FAILED
                with open(truncate_name(filename), 'wb') as handle:
                    handle.write(image_req.content)
        except Exception as exception:
            logging.error("Failed to download {} - {}".format(imagelink.name, exception))
            fail_dir(self.download_dir)
            return State.FAILED
        # Best get some licenses
        logging.info("writing license file")
        try:
            # NOTE(review): self._license is only set when the API
            # returned one; if absent this raises AttributeError, which
            # the IOError handler below does not catch - confirm.
            if self._license:
                with open(truncate_name(os.path.join(self.download_dir, 'license.txt')), 'w',
                          encoding="utf-8") as license_handle:
                    license_handle.write("{}\n".format(self._license))
        except IOError as exception:
            logging.warning("Failed to write license! {}".format(exception))
        logging.info("writing readme")
        try:
            # NOTE(review): same unset-attribute caveat as self._license.
            if self._details:
                with open(truncate_name(os.path.join(self.download_dir, 'readme.txt')), 'w',
                          encoding="utf-8") as readme_handle:
                    readme_handle.write("{}\n".format(self._details))
        except IOError as exception:
            logging.warning("Failed to write readme! {}".format(exception))
        try:
            # Now write the timestamp
            with open(os.path.join(self.download_dir, TIMESTAMP_FILE), 'w', encoding="utf-8") as timestamp_handle:
                timestamp_handle.write(self.time_stamp.__str__())
        except Exception as exception:
            logging.error("Failed to write timestamp file - {}".format(exception))
            fail_dir(self.download_dir)
            return State.FAILED
        self._needs_download = False
        logging.debug("Download of {} finished".format(self.name))
        if not compress:
            return State.OK
        # Archive the download dir as "<id> - <slug> - <stamp>.7z" and
        # then drop the uncompressed copy.
        thing_dir = "{} - {} - {}".format(self.thing_id,
                                          slugify(self.name),
                                          self.time_stamp.strftime(SAFE_DATETIME_FORMAT))
        file_name = os.path.join(base_dir,
                                 "{}.7z".format(thing_dir))
        logging.debug("Compressing {} to {}".format(
            self.name,
            file_name))
        with py7zr.SevenZipFile(file_name, 'w', filters=SEVENZIP_FILTERS) as archive:
            archive.writeall(self.download_dir, thing_dir)
        logging.debug("Compression of {} finished.".format(self.name))
        shutil.rmtree(self.download_dir)
        logging.debug("Removed temporary download dir of {}.".format(self.name))
        return State.OK
def do_batch(batch_file, download_dir, quick, compress, api_key):
    """ Read a file in line by line, parsing each as a set of calls to this script.

    Recognised instructions (whitespace separated):
        thing <id>
        collection <owner> <name>
        user <owner>
    Blank lines are skipped; anything else logs a warning.
    """
    with open(batch_file) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip empty lines
                continue
            logging.info("Handling instruction {}".format(line))
            command_arr = line.split()
            if command_arr[0] == "thing":
                logging.debug(
                    "Handling batch thing instruction: {}".format(line))
                Thing.from_thing_id(command_arr[1]).download(download_dir, compress, api_key)
                continue
            if command_arr[0] == "collection":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Collection(command_arr[1], command_arr[2],
                           download_dir, quick, compress, api_key).download()
                continue
            if command_arr[0] == "user":
                logging.debug(
                    "Handling batch collection instruction: {}".format(line))
                Designs(command_arr[1], download_dir, quick, compress, api_key).download()
                continue
            logging.warning("Unable to parse current instruction. Skipping.")
def main():
    """ Entry point for script being run as a command.

    Parses the command line, configures logging, resolves the API key
    (flag or ./api.key file), spawns the downloader worker processes and
    dispatches on the chosen subcommand.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log-level", choices=[
        'debug', 'info', 'warning'], default='info', help="level of logging desired")
    parser.add_argument("-d", "--directory",
                        help="Target directory to download into")
    parser.add_argument("-f", "--log-file",
                        help="Place to log debug information to")
    parser.add_argument("-q", "--quick", action="store_true",
                        help="Assume date ordering on posts")
    parser.add_argument("-c", "--compress", action="store_true",
                        help="Compress files")
    parser.add_argument("-a", "--api-key",
                        help="API key for thingiverse")
    subparsers = parser.add_subparsers(
        help="Type of thing to download", dest="subcommand")
    collection_parser = subparsers.add_parser(
        'collection', help="Download one or more entire collection(s)")
    collection_parser.add_argument(
        "owner", help="The owner of the collection(s) to get")
    collection_parser.add_argument(
        "collections", nargs="+", help="Space seperated list of the name(s) of collection to get")
    thing_parser = subparsers.add_parser(
        'thing', help="Download a single thing.")
    thing_parser.add_argument(
        "things", nargs="*", help="Space seperated list of thing ID(s) to download")
    user_parser = subparsers.add_parser(
        "user", help="Download all things by one or more users")
    user_parser.add_argument(
        "users", nargs="+", help="A space seperated list of the user(s) to get the designs of")
    batch_parser = subparsers.add_parser(
        "batch", help="Perform multiple actions written in a text file")
    batch_parser.add_argument(
        "batch_file", help="The name of the file to read.")
    subparsers.add_parser("version", help="Show the current version")
    args = parser.parse_args()
    if not args.subcommand:
        parser.print_help()
        sys.exit(1)
    if not args.directory:
        args.directory = os.getcwd()
    logger = logging.getLogger()
    # NOTE(review): this formatter is only attached to the file handler
    # below; console output keeps the default format.
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger.setLevel(logging.DEBUG)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(args.log_level.upper())
    # API key: command-line flag wins, otherwise read ./api.key.
    if args.api_key:
        api_key = args.api_key
    else:
        try:
            with open("api.key") as fh:
                api_key = fh.read().strip()
        except Exception as e:
            logging.error("Either specify the api-key on the command line or in a file called 'api.key'")
            logging.error("Exception: {}".format(e))
            return
    logger.addHandler(console_handler)
    if args.log_file:
        file_handler = logging.FileHandler(args.log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    # Start downloader
    thing_queue = multiprocessing.JoinableQueue()
    logging.debug("starting {} downloader(s)".format(DOWNLOADER_COUNT))
    downloaders = [Downloader(thing_queue, args.directory, args.compress, api_key) for _ in range(DOWNLOADER_COUNT)]
    for downloader in downloaders:
        downloader.start()
    # Only the 'thing' subcommand uses the worker queue; the others run
    # synchronously in this process.
    if args.subcommand.startswith("collection"):
        for collection in args.collections:
            Collection(args.owner, collection, args.directory, args.quick, args.compress, api_key).download()
    if args.subcommand == "thing":
        for thing in args.things:
            thing_queue.put(thing)
    if args.subcommand == "user":
        for user in args.users:
            Designs(user, args.directory, args.quick, args.compress, api_key).download()
    if args.subcommand == "version":
        print("thingy_grabber.py version {}".format(VERSION))
    if args.subcommand == "batch":
        do_batch(args.batch_file, args.directory, args.quick, args.compress, api_key)
    # Stop the downloader processes
    for _ in downloaders:
        thing_queue.put(None)
if __name__ == "__main__":
    # Needed so multiprocessing works when frozen into a Windows exe.
    multiprocessing.freeze_support()
    main()