reports.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. import copy
  2. try:
  3. from collections import OrderedDict # from python 2.7
  4. except ImportError:
  5. from sqlalchemy.util import OrderedDict
  6. from ckan.common import _
  7. import ckan.model as model
  8. import ckan.plugins as p
  9. from ckanext.report import lib
  10. def broken_links(organization, include_sub_organizations=False):
  11. if organization is None:
  12. return broken_links_index(include_sub_organizations=include_sub_organizations)
  13. else:
  14. return broken_links_for_organization(organization=organization, include_sub_organizations=include_sub_organizations)
  15. def broken_links_index(include_sub_organizations=False):
  16. '''Returns the count of broken links for all organizations.'''
  17. from ckanext.archiver.model import Archival
  18. counts = {}
  19. # Get all the broken datasets and build up the results by org
  20. orgs = model.Session.query(model.Group)\
  21. .filter(model.Group.type == 'organization')\
  22. .filter(model.Group.state == 'active').all()
  23. for org in add_progress_bar(
  24. orgs, 'Part 1/2' if include_sub_organizations else None):
  25. archivals = (model.Session.query(Archival)
  26. .filter(Archival.is_broken == True) # noqa
  27. .join(model.Package, Archival.package_id == model.Package.id)
  28. .filter(model.Package.owner_org == org.id)
  29. .filter(model.Package.state == 'active')
  30. .join(model.Resource, Archival.resource_id == model.Resource.id)
  31. .filter(model.Resource.state == 'active'))
  32. broken_resources = archivals.count()
  33. broken_datasets = archivals.distinct(model.Package.id).count()
  34. num_datasets = model.Session.query(model.Package)\
  35. .filter_by(owner_org=org.id)\
  36. .filter_by(state='active')\
  37. .count()
  38. num_resources = model.Session.query(model.Package)\
  39. .filter_by(owner_org=org.id)\
  40. .filter_by(state='active')
  41. if p.toolkit.check_ckan_version(max_version='2.2.99'):
  42. num_resources = num_resources.join(model.ResourceGroup)
  43. num_resources = num_resources \
  44. .join(model.Resource)\
  45. .filter_by(state='active')\
  46. .count()
  47. counts[org.name] = {
  48. 'organization_title': org.title,
  49. 'broken_packages': broken_datasets,
  50. 'broken_resources': broken_resources,
  51. 'packages': num_datasets,
  52. 'resources': num_resources
  53. }
  54. counts_with_sub_orgs = copy.deepcopy(counts) # new dict
  55. if include_sub_organizations:
  56. for org_name in add_progress_bar(counts_with_sub_orgs, 'Part 2/2'):
  57. org = model.Group.by_name(org_name)
  58. for sub_org_id, sub_org_name, sub_org_title, sub_org_parent_id \
  59. in org.get_children_group_hierarchy(type='organization'):
  60. if sub_org_name not in counts:
  61. # occurs only if there is an organization created since the last loop?
  62. continue
  63. counts_with_sub_orgs[org_name]['broken_packages'] += \
  64. counts[sub_org_name]['broken_packages']
  65. counts_with_sub_orgs[org_name]['broken_resources'] += \
  66. counts[sub_org_name]['broken_resources']
  67. counts_with_sub_orgs[org_name]['packages'] += \
  68. counts[sub_org_name]['packages']
  69. counts_with_sub_orgs[org_name]['resources'] += \
  70. counts[sub_org_name]['resources']
  71. results = counts_with_sub_orgs
  72. else:
  73. results = counts
  74. data = []
  75. num_broken_packages = 0
  76. num_broken_resources = 0
  77. num_packages = 0
  78. num_resources = 0
  79. for org_name, org_counts in results.items():
  80. data.append(OrderedDict((
  81. ('organization_title', results[org_name]['organization_title']),
  82. ('organization_name', org_name),
  83. ('package_count', org_counts['packages']),
  84. ('resource_count', org_counts['resources']),
  85. ('broken_package_count', org_counts['broken_packages']),
  86. ('broken_package_percent', lib.percent(org_counts['broken_packages'], org_counts['packages'])),
  87. ('broken_resource_count', org_counts['broken_resources']),
  88. ('broken_resource_percent', lib.percent(org_counts['broken_resources'], org_counts['resources'])),
  89. )))
  90. # Totals - always use the counts, rather than counts_with_sub_orgs, to
  91. # avoid counting a package in both its org and parent org
  92. org_counts_ = counts[org_name]
  93. num_broken_packages += org_counts_['broken_packages']
  94. num_broken_resources += org_counts_['broken_resources']
  95. num_packages += org_counts_['packages']
  96. num_resources += org_counts_['resources']
  97. data.sort(key=lambda x: (-x['broken_package_count'],
  98. -x['broken_resource_count']))
  99. return {'table': data,
  100. 'num_broken_packages': num_broken_packages,
  101. 'num_broken_resources': num_broken_resources,
  102. 'num_packages': num_packages,
  103. 'num_resources': num_resources,
  104. 'broken_package_percent': lib.percent(num_broken_packages, num_packages),
  105. 'broken_resource_percent': lib.percent(num_broken_resources, num_resources),
  106. }
  107. def broken_links_for_organization(organization, include_sub_organizations=False):
  108. '''
  109. Returns a dictionary detailing broken resource links for the organization
  110. or if organization it returns the index page for all organizations.
  111. params:
  112. organization - name of an organization
  113. Returns:
  114. {'organization_name': 'cabinet-office',
  115. 'organization_title:': 'Cabinet Office',
  116. 'table': [
  117. {'package_name', 'package_title', 'resource_url', 'status', 'reason', 'last_success',
  118. 'first_failure', 'failure_count', 'last_updated'}
  119. ...]
  120. '''
  121. from ckanext.archiver.model import Archival
  122. org = model.Group.get(organization)
  123. if not org:
  124. raise p.toolkit.ObjectNotFound()
  125. name = org.name
  126. title = org.title
  127. archivals = (model.Session.query(Archival, model.Package, model.Group).
  128. filter(Archival.is_broken == True). # noqa
  129. join(model.Package, Archival.package_id == model.Package.id).
  130. filter(model.Package.state == 'active').
  131. join(model.Resource, Archival.resource_id == model.Resource.id).
  132. filter(model.Resource.state == 'active'))
  133. if not include_sub_organizations:
  134. org_ids = [org.id]
  135. archivals = archivals.filter(model.Package.owner_org == org.id)
  136. else:
  137. # We want any organization_id that is part of this organization's tree
  138. org_ids = ['%s' % child_org.id for child_org in lib.go_down_tree(org)]
  139. archivals = archivals.filter(model.Package.owner_org.in_(org_ids))
  140. archivals = archivals.join(model.Group, model.Package.owner_org == model.Group.id)
  141. results = []
  142. for archival, pkg, org in archivals.all():
  143. pkg = model.Package.get(archival.package_id)
  144. resource = model.Resource.get(archival.resource_id)
  145. via = ''
  146. er = pkg.extras.get('external_reference', '')
  147. if er == 'ONSHUB':
  148. via = "Stats Hub"
  149. elif er.startswith("DATA4NR"):
  150. via = "Data4nr"
  151. # CKAN 2.9 does not have revisions
  152. if p.toolkit.check_ckan_version(max_version="2.8.99"):
  153. archived_resource = model.Session.query(model.ResourceRevision)\
  154. .filter_by(id=resource.id)\
  155. .filter_by(revision_timestamp=archival.resource_timestamp)\
  156. .first() or resource
  157. else:
  158. archived_resource = resource
  159. row_data = OrderedDict((
  160. ('dataset_title', pkg.title),
  161. ('dataset_name', pkg.name),
  162. ('dataset_notes', lib.dataset_notes(pkg)),
  163. ('organization_title', org.title),
  164. ('organization_name', org.name),
  165. ('resource_position', resource.position),
  166. ('resource_id', resource.id),
  167. ('resource_url', archived_resource.url),
  168. ('url_up_to_date', resource.url == archived_resource.url),
  169. ('via', via),
  170. ('first_failure', archival.first_failure.isoformat() if archival.first_failure else None),
  171. ('last_updated', archival.updated.isoformat() if archival.updated else None),
  172. ('last_success', archival.last_success.isoformat() if archival.last_success else None),
  173. ('url_redirected_to', archival.url_redirected_to),
  174. ('reason', archival.reason),
  175. ('status', archival.status),
  176. ('failure_count', archival.failure_count),
  177. ))
  178. results.append(row_data)
  179. num_broken_packages = archivals.distinct(model.Package.name).count()
  180. num_broken_resources = len(results)
  181. # Get total number of packages & resources
  182. num_packages = model.Session.query(model.Package)\
  183. .filter(model.Package.owner_org.in_(org_ids))\
  184. .filter_by(state='active')\
  185. .count()
  186. num_resources = model.Session.query(model.Resource)\
  187. .filter_by(state='active')
  188. if p.toolkit.check_ckan_version(max_version='2.2.99'):
  189. num_resources = num_resources.join(model.ResourceGroup)
  190. num_resources = num_resources \
  191. .join(model.Package)\
  192. .filter(model.Package.owner_org.in_(org_ids))\
  193. .filter_by(state='active').count()
  194. return {'organization_name': name,
  195. 'organization_title': title,
  196. 'num_broken_packages': num_broken_packages,
  197. 'num_broken_resources': num_broken_resources,
  198. 'num_packages': num_packages,
  199. 'num_resources': num_resources,
  200. 'broken_package_percent': lib.percent(num_broken_packages, num_packages),
  201. 'broken_resource_percent': lib.percent(num_broken_resources, num_resources),
  202. 'table': results}
  203. def broken_links_option_combinations():
  204. for organization in lib.all_organizations(include_none=True):
  205. for include_sub_organizations in (False, True):
  206. yield {'organization': organization,
  207. 'include_sub_organizations': include_sub_organizations}
  208. broken_links_report_info = {
  209. 'name': 'broken-links',
  210. 'title': _('Broken links'),
  211. 'description': _('Dataset resource URLs that are found to result in errors when resolved.'),
  212. 'option_defaults': OrderedDict((('organization', None),
  213. ('include_sub_organizations', False),
  214. )),
  215. 'option_combinations': broken_links_option_combinations,
  216. 'generate': broken_links,
  217. 'template': 'report/broken_links.html',
  218. }
  219. def add_progress_bar(iterable, caption=None):
  220. try:
  221. # Add a progress bar, if it is installed
  222. import progressbar
  223. bar = progressbar.ProgressBar(widgets=[
  224. (caption + ' ') if caption else '',
  225. progressbar.Percentage(), ' ',
  226. progressbar.Bar(), ' ', progressbar.ETA()])
  227. return bar(iterable)
  228. except ImportError:
  229. return iterable