model.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203
  1. import itertools
  2. from builtins import str
  3. from builtins import object
  4. import uuid
  5. from datetime import datetime
  6. from sqlalchemy import Column, MetaData
  7. from sqlalchemy import types
  8. from sqlalchemy.ext.declarative import declarative_base
  9. import ckan.model as model
  10. from ckan.lib import dictization
  11. log = __import__('logging').getLogger(__name__)
  12. Base = declarative_base()
  13. def make_uuid():
  14. return str(uuid.uuid4())
  15. metadata = MetaData()
  16. # enum of all the archival statuses (singleton)
  17. # NB Be very careful changing these status strings. They are also used in
  18. # ckanext-qa tasks.py.
  19. class Status(object):
  20. _instance = None
  21. def __init__(self):
  22. not_broken = {
  23. # is_broken = False
  24. 0: 'Archived successfully',
  25. 1: 'Content has not changed',
  26. }
  27. broken = {
  28. # is_broken = True
  29. 10: 'URL invalid',
  30. 11: 'URL request failed',
  31. 12: 'Download error',
  32. }
  33. not_sure = {
  34. # is_broken = None i.e. not sure
  35. 21: 'Chose not to download',
  36. 22: 'Download failure',
  37. 23: 'System error during archival',
  38. }
  39. self._by_id = dict(itertools.chain(not_broken.items(), broken.items()))
  40. self._by_id.update(not_sure)
  41. self._by_text = dict((value, key)
  42. for key, value in self._by_id.items())
  43. @classmethod
  44. def instance(cls):
  45. if not cls._instance:
  46. cls._instance = cls()
  47. return cls._instance
  48. @classmethod
  49. def by_text(cls, status_txt):
  50. return cls.instance()._by_text[status_txt]
  51. @classmethod
  52. def by_id(cls, status_id):
  53. return cls.instance()._by_id[status_id]
  54. @classmethod
  55. def is_status_broken(cls, status_id):
  56. if status_id < 10:
  57. return False
  58. elif status_id < 20:
  59. return True
  60. else:
  61. return None # not sure
  62. @classmethod
  63. def is_ok(cls, status_id):
  64. return status_id in [0, 1]
  65. broken_enum = {True: 'Broken',
  66. None: 'Not sure if broken',
  67. False: 'Downloaded OK'}
  68. class Archival(Base):
  69. """
  70. Details of the archival of resources. Has the filepath for successfully
  71. archived resources. Basic error history provided for unsuccessful ones.
  72. """
  73. __tablename__ = 'archival'
  74. id = Column(types.UnicodeText, primary_key=True, default=make_uuid)
  75. package_id = Column(types.UnicodeText, nullable=False, index=True)
  76. resource_id = Column(types.UnicodeText, nullable=False, index=True)
  77. resource_timestamp = Column(types.DateTime) # key to resource_revision
  78. # Details of the latest archival attempt
  79. status_id = Column(types.Integer)
  80. is_broken = Column(types.Boolean) # Based on status_id. None = not sure
  81. reason = Column(types.UnicodeText) # Extra detail explaining the status (cannot be translated)
  82. url_redirected_to = Column(types.UnicodeText)
  83. # Details of last successful archival
  84. cache_filepath = Column(types.UnicodeText)
  85. cache_url = Column(types.UnicodeText)
  86. size = Column(types.BigInteger, default=0)
  87. mimetype = Column(types.UnicodeText)
  88. hash = Column(types.UnicodeText)
  89. etag = Column(types.UnicodeText)
  90. last_modified = Column(types.UnicodeText)
  91. # History
  92. first_failure = Column(types.DateTime)
  93. last_success = Column(types.DateTime)
  94. failure_count = Column(types.Integer, default=0)
  95. created = Column(types.DateTime, default=datetime.now)
  96. updated = Column(types.DateTime)
  97. def __repr__(self):
  98. broken_details = '' if not self.is_broken else \
  99. ('%d failures' % self.failure_count)
  100. package = model.Package.get(self.package_id)
  101. package_name = package.name if package else '?%s?' % self.package_id
  102. return '<Archival %s /dataset/%s/resource/%s %s>' % \
  103. (broken_enum[self.is_broken], package_name, self.resource_id,
  104. broken_details)
  105. @classmethod
  106. def get_for_resource(cls, resource_id):
  107. '''Returns the archival for the given resource, or if it doens't exist,
  108. returns None.'''
  109. return model.Session.query(cls).filter(cls.resource_id == resource_id).first()
  110. @classmethod
  111. def get_for_package(cls, package_id):
  112. '''Returns the archivals for the given package. May not be any if the
  113. package has no resources or has not been archived. It checks the
  114. resources are not deleted.'''
  115. return model.Session.query(cls) \
  116. .filter(cls.package_id == package_id) \
  117. .join(model.Resource, cls.resource_id == model.Resource.id) \
  118. .filter(model.Resource.state == 'active') \
  119. .all()
  120. @classmethod
  121. def create(cls, resource_id):
  122. c = cls()
  123. resource = model.Resource.get(resource_id)
  124. c.resource_id = resource_id
  125. c.package_id = resource.package_id
  126. return c
  127. @property
  128. def status(self):
  129. if self.status_id is None:
  130. return None
  131. return Status.by_id(self.status_id)
  132. def as_dict(self):
  133. context = {'model': model}
  134. archival_dict = dictization.table_dictize(self, context)
  135. archival_dict['status'] = self.status
  136. archival_dict['is_broken_printable'] = broken_enum[self.is_broken]
  137. return archival_dict
  138. def aggregate_archivals_for_a_dataset(archivals):
  139. '''Returns aggregated archival info for a dataset, given the archivals for
  140. its resources (returned by get_for_package).
  141. :param archivals: A list of the archivals for a dataset's resources
  142. :type archivals: A list of Archival objects
  143. :returns: Archival dict about the dataset, with keys:
  144. status_id
  145. status
  146. reason
  147. is_broken
  148. '''
  149. archival_dict = {'status_id': None, 'status': None,
  150. 'reason': None, 'is_broken': None}
  151. for archival in archivals:
  152. # status_id takes the highest id i.e. pessimistic
  153. # reason matches the status_id
  154. if archival_dict['status_id'] is None or \
  155. archival.status_id > archival_dict['status_id']:
  156. archival_dict['status_id'] = archival.status_id
  157. archival_dict['reason'] = archival.reason
  158. if archivals:
  159. archival_dict['status'] = Status.by_id(archival_dict['status_id'])
  160. archival_dict['is_broken'] = \
  161. Status.is_status_broken(archival_dict['status_id'])
  162. return archival_dict
  163. def init_tables(engine):
  164. Base.metadata.create_all(engine)
  165. log.info('Archiver database tables are set-up')