  1. #!/usr/bin/env python3
  2. # Contest Management System - http://cms-dev.github.io/
  3. # Copyright © 2010-2012 Giovanni Mascellani <mascellani@poisson.phc.unipi.it>
  4. # Copyright © 2010-2018 Stefano Maggiolo <s.maggiolo@gmail.com>
  5. # Copyright © 2010-2012 Matteo Boscariol <boscarim@hotmail.com>
  6. # Copyright © 2013-2018 Luca Wehrstedt <luca.wehrstedt@gmail.com>
  7. # Copyright © 2014 Luca Versari <veluca93@gmail.com>
  8. #
  9. # This program is free software: you can redistribute it and/or modify
  10. # it under the terms of the GNU Affero General Public License as
  11. # published by the Free Software Foundation, either version 3 of the
  12. # License, or (at your option) any later version.
  13. #
  14. # This program is distributed in the hope that it will be useful,
  15. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. # GNU Affero General Public License for more details.
  18. #
  19. # You should have received a copy of the GNU Affero General Public License
  20. # along with this program. If not, see <http://www.gnu.org/licenses/>.
  21. """This service exports every data that CMS knows. The process of
  22. exporting and importing again should be idempotent.
  23. """
  24. # We enable monkey patching to make many libraries gevent-friendly
  25. # (for instance, urllib3, used by requests)
  26. import gevent.monkey
  27. gevent.monkey.patch_all() # noqa
  28. import argparse
  29. import json
  30. import logging
  31. import os
  32. import sys
  33. import tarfile
  34. import tempfile
  35. from datetime import date
  36. from sqlalchemy.types import \
  37. Boolean, Integer, Float, String, Unicode, DateTime, Interval, Enum
  38. from sqlalchemy.dialects.postgresql import ARRAY, CIDR, JSONB
  39. from cms import rmtree, utf8_decoder
  40. from cms.db import version as model_version, Codename, Filename, \
  41. FilenameSchema, FilenameSchemaArray, Digest, SessionGen, Contest, User, \
  42. Task, Submission, UserTest, SubmissionResult, UserTestResult, PrintJob, \
  43. Announcement, Participation, enumerate_files
  44. from cms.db.filecacher import FileCacher
  45. from cmscommon.datetime import make_timestamp
  46. from cmscommon.digest import path_digest
# Module-wide logger, named after this module per the logging convention.
logger = logging.getLogger(__name__)
  48. def get_archive_info(file_name):
  49. """Return information about the archive name.
  50. file_name (string): the file name of the archive to analyze.
  51. return (dict): dictionary containing the following keys:
  52. "basename", "extension", "write_mode"
  53. """
  54. # TODO - This method doesn't seem to be a masterpiece in terms of
  55. # cleanness...
  56. ret = {"basename": "",
  57. "extension": "",
  58. "write_mode": "",
  59. }
  60. if not (file_name.endswith(".tar.gz")
  61. or file_name.endswith(".tar.bz2")
  62. or file_name.endswith(".tar")
  63. or file_name.endswith(".zip")):
  64. return ret
  65. if file_name.endswith(".tar"):
  66. ret["basename"] = os.path.basename(file_name[:-4])
  67. ret["extension"] = "tar"
  68. ret["write_mode"] = "w:"
  69. elif file_name.endswith(".tar.gz"):
  70. ret["basename"] = os.path.basename(file_name[:-7])
  71. ret["extension"] = "tar.gz"
  72. ret["write_mode"] = "w:gz"
  73. elif file_name.endswith(".tar.bz2"):
  74. ret["basename"] = os.path.basename(file_name[:-8])
  75. ret["extension"] = "tar.bz2"
  76. ret["write_mode"] = "w:bz2"
  77. elif file_name.endswith(".zip"):
  78. ret["basename"] = os.path.basename(file_name[:-4])
  79. ret["extension"] = "zip"
  80. ret["write_mode"] = ""
  81. return ret
  82. def encode_value(type_, value):
  83. """Encode a given value of a given type to a JSON-compatible form.
  84. type_ (sqlalchemy.types.TypeEngine): the SQLAlchemy type of the
  85. column that held the value.
  86. value (object): the value.
  87. return (object): the value, encoded as bool, int, float, string,
  88. list, dict or any other JSON-compatible format.
  89. """
  90. if value is None:
  91. return None
  92. elif isinstance(type_, (
  93. Boolean, Integer, Float, String, Unicode, Enum, JSONB, Codename,
  94. Filename, FilenameSchema, Digest)):
  95. return value
  96. elif isinstance(type_, DateTime):
  97. return make_timestamp(value)
  98. elif isinstance(type_, Interval):
  99. return value.total_seconds()
  100. elif isinstance(type_, (ARRAY, FilenameSchemaArray)):
  101. return list(encode_value(type_.item_type, item) for item in value)
  102. elif isinstance(type_, CIDR):
  103. return str(value)
  104. else:
  105. raise RuntimeError("Unknown SQLAlchemy column type: %s" % type_)
class DumpExporter:
    """This service exports every data that CMS knows. The process of
    exporting and importing again should be idempotent.

    """

    def __init__(self, contest_ids, export_target,
                 dump_files, dump_model, skip_generated,
                 skip_submissions, skip_user_tests, skip_users,
                 skip_print_jobs):
        """Collect the IDs to export and store the export options.

        contest_ids ([int]|None): the contests to export; None means
            export everything in the database.
        export_target (string): target directory or archive name; if
            empty, a date-stamped default archive name is used.
        dump_files (bool): whether to export the files referenced by
            the database.
        dump_model (bool): whether to export the database structure
            itself (as contest.json).
        skip_generated (bool): don't export data that can be
            regenerated (submission/user test results).
        skip_submissions (bool): don't export submissions.
        skip_user_tests (bool): don't export user tests.
        skip_users (bool): don't export users and user-related data.
        skip_print_jobs (bool): don't export print jobs.

        """
        if contest_ids is None:
            # Whole-database export: gather the IDs of all contests,
            # (optionally) all users, and all contest-less tasks.
            with SessionGen() as session:
                contests = session.query(Contest).all()
                self.contests_ids = [contest.id for contest in contests]
                if not skip_users:
                    users = session.query(User).all()
                    self.users_ids = [user.id for user in users]
                else:
                    self.users_ids = []
                # Only tasks not attached to any contest are listed
                # here; tasks inside a contest are reached through the
                # contest itself when the object graph is walked.
                tasks = session.query(Task)\
                    .filter(Task.contest_id.is_(None)).all()
                self.tasks_ids = [task.id for task in tasks]
        else:
            # FIXME: this is ATM broken, because if you export a contest, you
            # then export the users who participated in it and then all of the
            # contests those users participated in.
            self.contests_ids = contest_ids
            self.users_ids = []
            self.tasks_ids = []
        self.dump_files = dump_files
        self.dump_model = dump_model
        self.skip_generated = skip_generated
        self.skip_submissions = skip_submissions
        self.skip_user_tests = skip_user_tests
        self.skip_users = skip_users
        self.skip_print_jobs = skip_print_jobs
        self.export_target = export_target

        # If target is not provided, we use the contest's name.
        if len(export_target) == 0:
            self.export_target = "dump_%s.tar.gz" % date.today().isoformat()
            logger.warning("export_target not given, using \"%s\"",
                           self.export_target)

        self.file_cacher = FileCacher()

    def do_export(self):
        """Run the actual export code.

        return (bool): True if the export succeeded, False otherwise.

        """
        logger.info("Starting export.")

        export_dir = self.export_target
        archive_info = get_archive_info(self.export_target)

        if archive_info["write_mode"] != "":
            # We are able to write to this archive; build the dump in a
            # fresh temporary directory and pack it up at the end.
            if os.path.exists(self.export_target):
                logger.critical("The specified file already exists, "
                                "I won't overwrite it.")
                return False
            export_dir = os.path.join(tempfile.mkdtemp(),
                                      archive_info["basename"])

        logger.info("Creating dir structure.")
        try:
            os.mkdir(export_dir)
        except OSError:
            logger.critical("The specified directory already exists, "
                            "I won't overwrite it.")
            return False

        files_dir = os.path.join(export_dir, "files")
        descr_dir = os.path.join(export_dir, "descriptions")
        os.mkdir(files_dir)
        os.mkdir(descr_dir)

        with SessionGen() as session:

            # Export files: each file is stored under its digest in
            # files/, with a human-readable description in descriptions/.
            logger.info("Exporting files.")
            if self.dump_files:
                for contest_id in self.contests_ids:
                    contest = Contest.get_from_id(contest_id, session)
                    files = enumerate_files(
                        session, contest,
                        skip_submissions=self.skip_submissions,
                        skip_user_tests=self.skip_user_tests,
                        skip_users=self.skip_users,
                        skip_print_jobs=self.skip_print_jobs,
                        skip_generated=self.skip_generated)
                    for file_ in files:
                        if not self.safe_get_file(file_,
                                                  os.path.join(files_dir,
                                                               file_),
                                                  os.path.join(descr_dir,
                                                               file_)):
                            return False

            # Export data in JSON format.
            if self.dump_model:
                logger.info("Exporting data to a JSON file.")

                # We use strings because they'll be the keys of a JSON
                # object
                self.ids = {}
                self.queue = []

                data = dict()

                # Seed the export queue with the root objects; get_id()
                # assigns each an export ID and enqueues it.
                for cls, lst in [(Contest, self.contests_ids),
                                 (User, self.users_ids),
                                 (Task, self.tasks_ids)]:
                    for i in lst:
                        obj = cls.get_from_id(i, session)
                        self.get_id(obj)

                # Specify the "root" of the data graph
                data["_objects"] = list(self.ids.values())

                # Breadth-first walk: exporting an object may enqueue
                # (via get_id) the objects it references.
                while len(self.queue) > 0:
                    obj = self.queue.pop(0)
                    data[self.ids[obj.sa_identity_key]] = \
                        self.export_object(obj)

                # Record the model version so the importer can check
                # compatibility.
                data["_version"] = model_version

                destination = os.path.join(export_dir, "contest.json")
                with open(destination, "wt", encoding="utf-8") as fout:
                    json.dump(data, fout, indent=4, sort_keys=True)

        # If the admin requested export to file, we do that.
        if archive_info["write_mode"] != "":
            with tarfile.open(self.export_target,
                              archive_info["write_mode"]) as archive:
                archive.add(export_dir, arcname=archive_info["basename"])
            rmtree(export_dir)

        logger.info("Export finished.")

        return True

    def get_id(self, obj):
        """Return the export ID assigned to obj, creating one if needed.

        When obj is seen for the first time it is also appended to the
        queue of objects still to be exported.

        obj (Base): a mapped object.

        return (string): the export-local ID of obj.

        """
        obj_key = obj.sa_identity_key
        if obj_key not in self.ids:
            # We use strings because they'll be the keys of a JSON object
            self.ids[obj_key] = "%d" % len(self.ids)
            self.queue.append(obj)

        return self.ids[obj_key]

    def export_object(self, obj):
        """Export the given object, returning a JSON-encodable dict.

        The returned dict will contain a "_class" item (the name of the
        class of the given object), an item for each column property
        (with a value properly translated to a JSON-compatible type)
        and an item for each relationship property (which will be an ID
        or a collection of IDs).

        The IDs used in the exported dict aren't related to the ones
        used in the DB: they are newly generated and their scope is
        limited to the exported file only. They are shared among all
        classes (that is, two objects can never share the same ID, even
        if they are of different classes).

        If, when exporting the relationship, we find an object without
        an ID we generate a new ID, assign it to the object and append
        the object to the queue of objects to export.

        The self.skip_submissions flag controls whether we export
        submissions (and all other objects that can be reached only by
        passing through a submission) or not.

        """
        cls = type(obj)

        data = {"_class": cls.__name__}

        # Plain column properties are encoded in place.
        for prp in cls._col_props:
            col, = prp.columns
            val = getattr(obj, prp.key)

            data[prp.key] = encode_value(col.type, val)

        # Relationship properties become export IDs (or collections of
        # IDs), subject to the skip_* filters below.
        for prp in cls._rel_props:
            other_cls = prp.mapper.class_

            # Skip submissions if requested
            if self.skip_submissions and other_cls is Submission:
                continue

            # Skip user_tests if requested
            if self.skip_user_tests and other_cls is UserTest:
                continue

            if self.skip_users:
                skip = False
                # User-related classes reachable from root
                for rel_class in [Participation, Submission, UserTest,
                                  Announcement]:
                    if other_cls is rel_class:
                        skip = True
                        break
                if skip:
                    continue

            # Skip print jobs if requested
            if self.skip_print_jobs and other_cls is PrintJob:
                continue

            # Skip generated data if requested
            if self.skip_generated and other_cls in (SubmissionResult,
                                                     UserTestResult):
                continue

            val = getattr(obj, prp.key)
            if val is None:
                data[prp.key] = None
            elif isinstance(val, other_cls):
                data[prp.key] = self.get_id(val)
            elif isinstance(val, list):
                data[prp.key] = list(self.get_id(i) for i in val)
            elif isinstance(val, dict):
                data[prp.key] = \
                    dict((k, self.get_id(v)) for k, v in val.items())
            else:
                raise RuntimeError("Unknown SQLAlchemy relationship type: %s"
                                   % type(val))

        return data

    def safe_get_file(self, digest, path, descr_path=None):
        """Get file from FileCacher ensuring that the digest is
        correct.

        digest (string): the digest of the file to retrieve.
        path (string): the path where to save the file.
        descr_path (string): the path where to save the description.

        return (bool): True if all ok, False if something wrong.

        """
        # TODO - Probably this method could be merged in FileCacher

        # First get the file
        try:
            self.file_cacher.get_file_to_path(digest, path)
        except Exception:
            logger.error("File %s could not retrieved from file server.",
                         digest, exc_info=True)
            return False

        # Then check the digest: re-hash the retrieved file and compare
        # against the digest we asked for, to detect corruption.
        calc_digest = path_digest(path)
        if digest != calc_digest:
            logger.critical("File %s has wrong hash %s.",
                            digest, calc_digest)
            return False

        # If applicable, retrieve also the description
        if descr_path is not None:
            with open(descr_path, 'wt', encoding='utf-8') as fout:
                fout.write(self.file_cacher.describe(digest))

        return True
  320. def main():
  321. """Parse arguments and launch process."""
  322. parser = argparse.ArgumentParser(description="Exporter of CMS data.")
  323. parser.add_argument("-c", "--contest-ids", nargs="+", type=int,
  324. metavar="contest_id", help="id of contest to export")
  325. group = parser.add_mutually_exclusive_group()
  326. group.add_argument("-f", "--files", action="store_true",
  327. help="only export files, ignore database structure")
  328. group.add_argument("-F", "--no-files", action="store_true",
  329. help="only export database structure, ignore files")
  330. parser.add_argument("-G", "--no-generated", action="store_true",
  331. help="don't export data and files that can be "
  332. "automatically generated")
  333. parser.add_argument("-S", "--no-submissions", action="store_true",
  334. help="don't export submissions")
  335. parser.add_argument("-U", "--no-user-tests", action="store_true",
  336. help="don't export user tests")
  337. parser.add_argument("-X", "--no-users", action="store_true",
  338. help="don't export users")
  339. parser.add_argument("-P", "--no-print-jobs", action="store_true",
  340. help="don't export print jobs")
  341. parser.add_argument("export_target", action="store",
  342. type=utf8_decoder, nargs='?', default="",
  343. help="target directory or archive for export")
  344. args = parser.parse_args()
  345. exporter = DumpExporter(contest_ids=args.contest_ids,
  346. export_target=args.export_target,
  347. dump_files=not args.no_files,
  348. dump_model=not args.files,
  349. skip_generated=args.no_generated,
  350. skip_submissions=args.no_submissions,
  351. skip_user_tests=args.no_user_tests,
  352. skip_users=args.no_users,
  353. skip_print_jobs=args.no_print_jobs)
  354. success = exporter.do_export()
  355. return 0 if success is True else 1
if __name__ == "__main__":
    # Script entry point: propagate main()'s return value (0 on
    # success, 1 on failure) as the process exit status.
    sys.exit(main())