123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440 |
- #!/usr/bin/env python3
- # Contest Management System - http://cms-dev.github.io/
- # Copyright © 2010-2012 Giovanni Mascellani <mascellani@poisson.phc.unipi.it>
- # Copyright © 2010-2018 Stefano Maggiolo <s.maggiolo@gmail.com>
- # Copyright © 2010-2012 Matteo Boscariol <boscarim@hotmail.com>
- # Copyright © 2013-2018 Luca Wehrstedt <luca.wehrstedt@gmail.com>
- # Copyright © 2014 Luca Versari <veluca93@gmail.com>
- #
- # This program is free software: you can redistribute it and/or modify
- # it under the terms of the GNU Affero General Public License as
- # published by the Free Software Foundation, either version 3 of the
- # License, or (at your option) any later version.
- #
- # This program is distributed in the hope that it will be useful,
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- # GNU Affero General Public License for more details.
- #
- # You should have received a copy of the GNU Affero General Public License
- # along with this program. If not, see <http://www.gnu.org/licenses/>.
- """This service exports every data that CMS knows. The process of
- exporting and importing again should be idempotent.
- """
- # We enable monkey patching to make many libraries gevent-friendly
- # (for instance, urllib3, used by requests)
- import gevent.monkey
- gevent.monkey.patch_all() # noqa
- import argparse
- import json
- import logging
- import os
- import sys
- import tarfile
- import tempfile
- from datetime import date
- from sqlalchemy.types import \
- Boolean, Integer, Float, String, Unicode, DateTime, Interval, Enum
- from sqlalchemy.dialects.postgresql import ARRAY, CIDR, JSONB
- from cms import rmtree, utf8_decoder
- from cms.db import version as model_version, Codename, Filename, \
- FilenameSchema, FilenameSchemaArray, Digest, SessionGen, Contest, User, \
- Task, Submission, UserTest, SubmissionResult, UserTestResult, PrintJob, \
- Announcement, Participation, enumerate_files
- from cms.db.filecacher import FileCacher
- from cmscommon.datetime import make_timestamp
- from cmscommon.digest import path_digest
- logger = logging.getLogger(__name__)
- def get_archive_info(file_name):
- """Return information about the archive name.
- file_name (string): the file name of the archive to analyze.
- return (dict): dictionary containing the following keys:
- "basename", "extension", "write_mode"
- """
- # TODO - This method doesn't seem to be a masterpiece in terms of
- # cleanness...
- ret = {"basename": "",
- "extension": "",
- "write_mode": "",
- }
- if not (file_name.endswith(".tar.gz")
- or file_name.endswith(".tar.bz2")
- or file_name.endswith(".tar")
- or file_name.endswith(".zip")):
- return ret
- if file_name.endswith(".tar"):
- ret["basename"] = os.path.basename(file_name[:-4])
- ret["extension"] = "tar"
- ret["write_mode"] = "w:"
- elif file_name.endswith(".tar.gz"):
- ret["basename"] = os.path.basename(file_name[:-7])
- ret["extension"] = "tar.gz"
- ret["write_mode"] = "w:gz"
- elif file_name.endswith(".tar.bz2"):
- ret["basename"] = os.path.basename(file_name[:-8])
- ret["extension"] = "tar.bz2"
- ret["write_mode"] = "w:bz2"
- elif file_name.endswith(".zip"):
- ret["basename"] = os.path.basename(file_name[:-4])
- ret["extension"] = "zip"
- ret["write_mode"] = ""
- return ret
- def encode_value(type_, value):
- """Encode a given value of a given type to a JSON-compatible form.
- type_ (sqlalchemy.types.TypeEngine): the SQLAlchemy type of the
- column that held the value.
- value (object): the value.
- return (object): the value, encoded as bool, int, float, string,
- list, dict or any other JSON-compatible format.
- """
- if value is None:
- return None
- elif isinstance(type_, (
- Boolean, Integer, Float, String, Unicode, Enum, JSONB, Codename,
- Filename, FilenameSchema, Digest)):
- return value
- elif isinstance(type_, DateTime):
- return make_timestamp(value)
- elif isinstance(type_, Interval):
- return value.total_seconds()
- elif isinstance(type_, (ARRAY, FilenameSchemaArray)):
- return list(encode_value(type_.item_type, item) for item in value)
- elif isinstance(type_, CIDR):
- return str(value)
- else:
- raise RuntimeError("Unknown SQLAlchemy column type: %s" % type_)
- class DumpExporter:
- """This service exports every data that CMS knows. The process of
- exporting and importing again should be idempotent.
- """
- def __init__(self, contest_ids, export_target,
- dump_files, dump_model, skip_generated,
- skip_submissions, skip_user_tests, skip_users, skip_print_jobs):
- if contest_ids is None:
- with SessionGen() as session:
- contests = session.query(Contest).all()
- self.contests_ids = [contest.id for contest in contests]
- if not skip_users:
- users = session.query(User).all()
- self.users_ids = [user.id for user in users]
- else:
- self.users_ids = []
- tasks = session.query(Task)\
- .filter(Task.contest_id.is_(None)).all()
- self.tasks_ids = [task.id for task in tasks]
- else:
- # FIXME: this is ATM broken, because if you export a contest, you
- # then export the users who participated in it and then all of the
- # contests those users participated in.
- self.contests_ids = contest_ids
- self.users_ids = []
- self.tasks_ids = []
- self.dump_files = dump_files
- self.dump_model = dump_model
- self.skip_generated = skip_generated
- self.skip_submissions = skip_submissions
- self.skip_user_tests = skip_user_tests
- self.skip_users = skip_users
- self.skip_print_jobs = skip_print_jobs
- self.export_target = export_target
- # If target is not provided, we use the contest's name.
- if len(export_target) == 0:
- self.export_target = "dump_%s.tar.gz" % date.today().isoformat()
- logger.warning("export_target not given, using \"%s\"",
- self.export_target)
- self.file_cacher = FileCacher()
- def do_export(self):
- """Run the actual export code."""
- logger.info("Starting export.")
- export_dir = self.export_target
- archive_info = get_archive_info(self.export_target)
- if archive_info["write_mode"] != "":
- # We are able to write to this archive.
- if os.path.exists(self.export_target):
- logger.critical("The specified file already exists, "
- "I won't overwrite it.")
- return False
- export_dir = os.path.join(tempfile.mkdtemp(),
- archive_info["basename"])
- logger.info("Creating dir structure.")
- try:
- os.mkdir(export_dir)
- except OSError:
- logger.critical("The specified directory already exists, "
- "I won't overwrite it.")
- return False
- files_dir = os.path.join(export_dir, "files")
- descr_dir = os.path.join(export_dir, "descriptions")
- os.mkdir(files_dir)
- os.mkdir(descr_dir)
- with SessionGen() as session:
- # Export files.
- logger.info("Exporting files.")
- if self.dump_files:
- for contest_id in self.contests_ids:
- contest = Contest.get_from_id(contest_id, session)
- files = enumerate_files(
- session, contest,
- skip_submissions=self.skip_submissions,
- skip_user_tests=self.skip_user_tests,
- skip_users=self.skip_users,
- skip_print_jobs=self.skip_print_jobs,
- skip_generated=self.skip_generated)
- for file_ in files:
- if not self.safe_get_file(file_,
- os.path.join(files_dir,
- file_),
- os.path.join(descr_dir,
- file_)):
- return False
- # Export data in JSON format.
- if self.dump_model:
- logger.info("Exporting data to a JSON file.")
- # We use strings because they'll be the keys of a JSON
- # object
- self.ids = {}
- self.queue = []
- data = dict()
- for cls, lst in [(Contest, self.contests_ids),
- (User, self.users_ids),
- (Task, self.tasks_ids)]:
- for i in lst:
- obj = cls.get_from_id(i, session)
- self.get_id(obj)
- # Specify the "root" of the data graph
- data["_objects"] = list(self.ids.values())
- while len(self.queue) > 0:
- obj = self.queue.pop(0)
- data[self.ids[obj.sa_identity_key]] = \
- self.export_object(obj)
- data["_version"] = model_version
- destination = os.path.join(export_dir, "contest.json")
- with open(destination, "wt", encoding="utf-8") as fout:
- json.dump(data, fout, indent=4, sort_keys=True)
- # If the admin requested export to file, we do that.
- if archive_info["write_mode"] != "":
- with tarfile.open(self.export_target,
- archive_info["write_mode"]) as archive:
- archive.add(export_dir, arcname=archive_info["basename"])
- rmtree(export_dir)
- logger.info("Export finished.")
- return True
- def get_id(self, obj):
- obj_key = obj.sa_identity_key
- if obj_key not in self.ids:
- # We use strings because they'll be the keys of a JSON object
- self.ids[obj_key] = "%d" % len(self.ids)
- self.queue.append(obj)
- return self.ids[obj_key]
- def export_object(self, obj):
- """Export the given object, returning a JSON-encodable dict.
- The returned dict will contain a "_class" item (the name of the
- class of the given object), an item for each column property
- (with a value properly translated to a JSON-compatible type)
- and an item for each relationship property (which will be an ID
- or a collection of IDs).
- The IDs used in the exported dict aren't related to the ones
- used in the DB: they are newly generated and their scope is
- limited to the exported file only. They are shared among all
- classes (that is, two objects can never share the same ID, even
- if they are of different classes).
- If, when exporting the relationship, we find an object without
- an ID we generate a new ID, assign it to the object and append
- the object to the queue of objects to export.
- The self.skip_submissions flag controls whether we export
- submissions (and all other objects that can be reached only by
- passing through a submission) or not.
- """
- cls = type(obj)
- data = {"_class": cls.__name__}
- for prp in cls._col_props:
- col, = prp.columns
- val = getattr(obj, prp.key)
- data[prp.key] = encode_value(col.type, val)
- for prp in cls._rel_props:
- other_cls = prp.mapper.class_
- # Skip submissions if requested
- if self.skip_submissions and other_cls is Submission:
- continue
- # Skip user_tests if requested
- if self.skip_user_tests and other_cls is UserTest:
- continue
- if self.skip_users:
- skip = False
- # User-related classes reachable from root
- for rel_class in [Participation, Submission, UserTest,
- Announcement]:
- if other_cls is rel_class:
- skip = True
- break
- if skip:
- continue
- # Skip print jobs if requested
- if self.skip_print_jobs and other_cls is PrintJob:
- continue
- # Skip generated data if requested
- if self.skip_generated and other_cls in (SubmissionResult,
- UserTestResult):
- continue
- val = getattr(obj, prp.key)
- if val is None:
- data[prp.key] = None
- elif isinstance(val, other_cls):
- data[prp.key] = self.get_id(val)
- elif isinstance(val, list):
- data[prp.key] = list(self.get_id(i) for i in val)
- elif isinstance(val, dict):
- data[prp.key] = \
- dict((k, self.get_id(v)) for k, v in val.items())
- else:
- raise RuntimeError("Unknown SQLAlchemy relationship type: %s"
- % type(val))
- return data
- def safe_get_file(self, digest, path, descr_path=None):
- """Get file from FileCacher ensuring that the digest is
- correct.
- digest (string): the digest of the file to retrieve.
- path (string): the path where to save the file.
- descr_path (string): the path where to save the description.
- return (bool): True if all ok, False if something wrong.
- """
- # TODO - Probably this method could be merged in FileCacher
- # First get the file
- try:
- self.file_cacher.get_file_to_path(digest, path)
- except Exception:
- logger.error("File %s could not retrieved from file server.",
- digest, exc_info=True)
- return False
- # Then check the digest
- calc_digest = path_digest(path)
- if digest != calc_digest:
- logger.critical("File %s has wrong hash %s.",
- digest, calc_digest)
- return False
- # If applicable, retrieve also the description
- if descr_path is not None:
- with open(descr_path, 'wt', encoding='utf-8') as fout:
- fout.write(self.file_cacher.describe(digest))
- return True
- def main():
- """Parse arguments and launch process."""
- parser = argparse.ArgumentParser(description="Exporter of CMS data.")
- parser.add_argument("-c", "--contest-ids", nargs="+", type=int,
- metavar="contest_id", help="id of contest to export")
- group = parser.add_mutually_exclusive_group()
- group.add_argument("-f", "--files", action="store_true",
- help="only export files, ignore database structure")
- group.add_argument("-F", "--no-files", action="store_true",
- help="only export database structure, ignore files")
- parser.add_argument("-G", "--no-generated", action="store_true",
- help="don't export data and files that can be "
- "automatically generated")
- parser.add_argument("-S", "--no-submissions", action="store_true",
- help="don't export submissions")
- parser.add_argument("-U", "--no-user-tests", action="store_true",
- help="don't export user tests")
- parser.add_argument("-X", "--no-users", action="store_true",
- help="don't export users")
- parser.add_argument("-P", "--no-print-jobs", action="store_true",
- help="don't export print jobs")
- parser.add_argument("export_target", action="store",
- type=utf8_decoder, nargs='?', default="",
- help="target directory or archive for export")
- args = parser.parse_args()
- exporter = DumpExporter(contest_ids=args.contest_ids,
- export_target=args.export_target,
- dump_files=not args.no_files,
- dump_model=not args.files,
- skip_generated=args.no_generated,
- skip_submissions=args.no_submissions,
- skip_user_tests=args.no_user_tests,
- skip_users=args.no_users,
- skip_print_jobs=args.no_print_jobs)
- success = exporter.do_export()
- return 0 if success is True else 1
- if __name__ == "__main__":
- sys.exit(main())
|