cli.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256
  1. # Copyright (c) 2017 crocoite contributors
  2. #
  3. # Permission is hereby granted, free of charge, to any person obtaining a copy
  4. # of this software and associated documentation files (the "Software"), to deal
  5. # in the Software without restriction, including without limitation the rights
  6. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. # copies of the Software, and to permit persons to whom the Software is
  8. # furnished to do so, subject to the following conditions:
  9. #
  10. # The above copyright notice and this permission notice shall be included in
  11. # all copies or substantial portions of the Software.
  12. #
  13. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  18. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  19. # THE SOFTWARE.
  20. """
  21. Command line interface
  22. """
  23. import argparse, sys, signal, asyncio, os, json
  24. from traceback import TracebackException
  25. from enum import IntEnum
  26. from yarl import URL
  27. from http.cookies import SimpleCookie
  28. import pkg_resources
  29. try:
  30. import manhole
  31. manhole.install (patch_fork=False, oneshot_on='USR1')
  32. except ModuleNotFoundError:
  33. pass
  34. from . import behavior, browser
  35. from .controller import SinglePageController, \
  36. ControllerSettings, StatsHandler, LogHandler, \
  37. RecursiveController, DepthLimit, PrefixLimit
  38. from .devtools import Passthrough, Process
  39. from .warc import WarcHandler
  40. from .logger import Logger, JsonPrintConsumer, DatetimeConsumer, \
  41. WarcHandlerConsumer, Level
  42. from .devtools import Crashed
  43. def absurl (s):
  44. """ argparse: Absolute URL """
  45. u = URL (s)
  46. if u.is_absolute ():
  47. return u
  48. raise argparse.ArgumentTypeError ('Must be absolute')
  49. def cookie (s):
  50. """ argparse: Cookie """
  51. c = SimpleCookie (s)
  52. # for some reason the constructor does not raise an exception if the cookie
  53. # supplied is invalid. It’ll simply be empty.
  54. if len (c) != 1:
  55. raise argparse.ArgumentTypeError ('Invalid cookie')
  56. # we want a single Morsel
  57. return next (iter (c.values ()))
  58. def cookiejar (f):
  59. """ argparse: Cookies from file """
  60. cookies = []
  61. try:
  62. with open (f, 'r') as fd:
  63. for l in fd:
  64. l = l.lstrip ()
  65. if l and not l.startswith ('#'):
  66. cookies.append (cookie (l))
  67. except FileNotFoundError:
  68. raise argparse.ArgumentTypeError (f'Cookie jar "{f}" does not exist')
  69. return cookies
  70. class SingleExitStatus(IntEnum):
  71. """ Exit status for single-shot command line """
  72. Ok = 0
  73. Fail = 1
  74. BrowserCrash = 2
  75. Navigate = 3
  76. def single ():
  77. parser = argparse.ArgumentParser(description='crocoite helper tools to fetch individual pages.')
  78. parser.add_argument('--browser', help='DevTools URL', type=absurl, metavar='URL')
  79. parser.add_argument('--timeout', default=1*60*60, type=int, help='Maximum time for archival', metavar='SEC')
  80. parser.add_argument('--idle-timeout', default=30, type=int, help='Maximum idle seconds (i.e. no requests)', dest='idleTimeout', metavar='SEC')
  81. parser.add_argument('--behavior', help='Enable behavior script',
  82. dest='enabledBehaviorNames',
  83. default=list (behavior.availableMap.keys ()),
  84. choices=list (behavior.availableMap.keys ()),
  85. metavar='NAME', nargs='*')
  86. parser.add_argument('--warcinfo', help='Add extra information to warcinfo record',
  87. metavar='JSON', type=json.loads)
  88. # re-using curl’s short/long switch names whenever possible
  89. parser.add_argument('-k', '--insecure',
  90. action='store_true',
  91. help='Disable certificate validation')
  92. parser.add_argument ('-b', '--cookie', type=cookie, metavar='SET-COOKIE',
  93. action='append', default=[], help='Cookies in Set-Cookie format.')
  94. parser.add_argument ('-c', '--cookie-jar', dest='cookieJar',
  95. type=cookiejar, metavar='FILE',
  96. default=pkg_resources.resource_filename (__name__, 'data/cookies.txt'),
  97. help='Cookie jar file, read-only.')
  98. parser.add_argument('url', help='Website URL', type=absurl, metavar='URL')
  99. parser.add_argument('output', help='WARC filename', metavar='FILE')
  100. args = parser.parse_args ()
  101. logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()])
  102. ret = SingleExitStatus.Fail
  103. service = Process ()
  104. if args.browser:
  105. service = Passthrough (args.browser)
  106. settings = ControllerSettings (
  107. idleTimeout=args.idleTimeout,
  108. timeout=args.timeout,
  109. insecure=args.insecure,
  110. cookies=args.cookieJar + args.cookie,
  111. )
  112. with open (args.output, 'wb') as fd, WarcHandler (fd, logger) as warcHandler:
  113. logger.connect (WarcHandlerConsumer (warcHandler))
  114. handler = [StatsHandler (), LogHandler (logger), warcHandler]
  115. b = list (map (lambda x: behavior.availableMap[x], args.enabledBehaviorNames))
  116. controller = SinglePageController (url=args.url, settings=settings,
  117. service=service, handler=handler, behavior=b, logger=logger,
  118. warcinfo=args.warcinfo)
  119. try:
  120. loop = asyncio.get_event_loop()
  121. run = asyncio.ensure_future (controller.run ())
  122. stop = lambda signum: run.cancel ()
  123. loop.add_signal_handler (signal.SIGINT, stop, signal.SIGINT)
  124. loop.add_signal_handler (signal.SIGTERM, stop, signal.SIGTERM)
  125. loop.run_until_complete(run)
  126. loop.close()
  127. ret = SingleExitStatus.Ok
  128. except Crashed:
  129. ret = SingleExitStatus.BrowserCrash
  130. except asyncio.CancelledError:
  131. # don’t log this one
  132. pass
  133. except browser.NavigateError:
  134. ret = SingleExitStatus.Navigate
  135. except Exception as e:
  136. ret = SingleExitStatus.Fail
  137. logger.error ('cli exception',
  138. uuid='7fd69858-ecaa-4225-b213-8ab880aa3cc5',
  139. traceback=list (TracebackException.from_exception (e).format ()))
  140. finally:
  141. r = handler[0].stats
  142. logger.info ('stats', context='cli', uuid='24d92d16-770e-4088-b769-4020e127a7ff', **r)
  143. logger.info ('exit', context='cli', uuid='9b1bd603-f7cd-4745-895a-5b894a5166f2', status=ret)
  144. return ret
  145. def parsePolicy (recursive, url):
  146. if recursive is None:
  147. return DepthLimit (0)
  148. elif recursive.isdigit ():
  149. return DepthLimit (int (recursive))
  150. elif recursive == 'prefix':
  151. return PrefixLimit (url)
  152. raise argparse.ArgumentTypeError ('Unsupported recursion mode')
  153. def recursive ():
  154. logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()])
  155. parser = argparse.ArgumentParser(description='Save website to WARC using Google Chrome.')
  156. parser.add_argument('-j', '--concurrency',
  157. help='Run at most N jobs concurrently', metavar='N', default=1,
  158. type=int)
  159. parser.add_argument('-r', '--recursion', help='Recursion policy',
  160. metavar='POLICY')
  161. parser.add_argument('--tempdir', help='Directory for temporary files',
  162. metavar='DIR')
  163. parser.add_argument('url', help='Seed URL', type=absurl, metavar='URL')
  164. parser.add_argument('output',
  165. help='Output file, supports templates {host}, {date} and {seqnum}',
  166. metavar='FILE')
  167. parser.add_argument('command',
  168. help='Fetch command, supports templates {url} and {dest}',
  169. metavar='CMD', nargs='*',
  170. default=['crocoite-single', '{url}', '{dest}'])
  171. args = parser.parse_args ()
  172. try:
  173. policy = parsePolicy (args.recursion, args.url)
  174. except argparse.ArgumentTypeError as e:
  175. parser.error (str (e))
  176. try:
  177. controller = RecursiveController (url=args.url, output=args.output,
  178. command=args.command, logger=logger, policy=policy,
  179. tempdir=args.tempdir, concurrency=args.concurrency)
  180. except ValueError as e:
  181. parser.error (str (e))
  182. run = asyncio.ensure_future (controller.run ())
  183. loop = asyncio.get_event_loop()
  184. stop = lambda signum: run.cancel ()
  185. loop.add_signal_handler (signal.SIGINT, stop, signal.SIGINT)
  186. loop.add_signal_handler (signal.SIGTERM, stop, signal.SIGTERM)
  187. try:
  188. loop.run_until_complete(run)
  189. except asyncio.CancelledError:
  190. pass
  191. finally:
  192. loop.close()
  193. return 0
  194. def irc ():
  195. import json, re
  196. from .irc import Chromebot
  197. logger = Logger (consumer=[DatetimeConsumer (), JsonPrintConsumer ()])
  198. parser = argparse.ArgumentParser(description='IRC bot.')
  199. parser.add_argument('--config', '-c', help='Config file location', metavar='PATH', default='chromebot.json')
  200. args = parser.parse_args ()
  201. with open (args.config) as fd:
  202. config = json.load (fd)
  203. s = config['irc']
  204. blacklist = dict (map (lambda x: (re.compile (x[0], re.I), x[1]), config['blacklist'].items ()))
  205. loop = asyncio.get_event_loop()
  206. bot = Chromebot (
  207. host=s['host'],
  208. port=s['port'],
  209. ssl=s['ssl'],
  210. nick=s['nick'],
  211. channels=s['channels'],
  212. tempdir=config['tempdir'],
  213. destdir=config['destdir'],
  214. processLimit=config['process_limit'],
  215. logger=logger,
  216. blacklist=blacklist,
  217. needVoice=config['need_voice'],
  218. loop=loop)
  219. stop = lambda signum: bot.cancel ()
  220. loop.add_signal_handler (signal.SIGINT, stop, signal.SIGINT)
  221. loop.add_signal_handler (signal.SIGTERM, stop, signal.SIGTERM)
  222. loop.run_until_complete(bot.run ())
  223. def dashboard ():
  224. from .irc import Dashboard
  225. loop = asyncio.get_event_loop()
  226. d = Dashboard (sys.stdin, loop)
  227. loop.run_until_complete(d.run ())
  228. loop.run_forever()