# test_capture_http_proxy.py
  1. from warcio.capture_http import capture_http
  2. import threading
  3. from wsgiref.simple_server import make_server, WSGIServer
  4. import time
  5. import requests
  6. from warcio.archiveiterator import ArchiveIterator
  7. from pytest import raises
  8. # ==================================================================
  9. class TestCaptureHttpProxy():
  10. def setup(cls):
  11. def app(env, start_response):
  12. result = ('Proxied: ' + env['PATH_INFO']).encode('utf-8')
  13. headers = [('Content-Length', str(len(result)))]
  14. start_response('200 OK', headers=headers)
  15. return iter([result])
  16. from wsgiprox.wsgiprox import WSGIProxMiddleware
  17. wsgiprox = WSGIProxMiddleware(app, '/')
  18. class NoLogServer(WSGIServer):
  19. def handle_error(self, request, client_address):
  20. pass
  21. server = make_server('localhost', 0, wsgiprox, server_class=NoLogServer)
  22. addr, cls.port = server.socket.getsockname()
  23. cls.proxies = {'https': 'localhost:' + str(cls.port),
  24. 'http': 'localhost:' + str(cls.port)
  25. }
  26. def run():
  27. try:
  28. server.serve_forever()
  29. except Exception as e:
  30. print(e)
  31. thread = threading.Thread(target=run)
  32. thread.daemon = True
  33. thread.start()
  34. time.sleep(0.1)
  35. def test_capture_http_proxy(self):
  36. with capture_http() as warc_writer:
  37. res = requests.get("http://example.com/test", proxies=self.proxies, verify=False)
  38. ai = ArchiveIterator(warc_writer.get_stream())
  39. response = next(ai)
  40. assert response.rec_type == 'response'
  41. assert response.rec_headers['WARC-Target-URI'] == "http://example.com/test"
  42. assert response.content_stream().read().decode('utf-8') == 'Proxied: /http://example.com/test'
  43. assert response.rec_headers['WARC-Proxy-Host'] == 'http://localhost:{0}'.format(self.port)
  44. request = next(ai)
  45. assert request.rec_type == 'request'
  46. assert request.rec_headers['WARC-Target-URI'] == "http://example.com/test"
  47. assert request.rec_headers['WARC-Proxy-Host'] == 'http://localhost:{0}'.format(self.port)
  48. with raises(StopIteration):
  49. assert next(ai)
  50. def test_capture_https_proxy(self):
  51. with capture_http() as warc_writer:
  52. res = requests.get("https://example.com/test", proxies=self.proxies, verify=False)
  53. res = requests.get("https://example.com/foo", proxies=self.proxies, verify=False)
  54. # not recording this request
  55. res = requests.get("https://example.com/skip", proxies=self.proxies, verify=False)
  56. with capture_http(warc_writer):
  57. res = requests.get("https://example.com/bar", proxies=self.proxies, verify=False)
  58. ai = ArchiveIterator(warc_writer.get_stream())
  59. response = next(ai)
  60. assert response.rec_type == 'response'
  61. assert response.rec_headers['WARC-Target-URI'] == "https://example.com/test"
  62. assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
  63. assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/test'
  64. request = next(ai)
  65. assert request.rec_type == 'request'
  66. assert request.rec_headers['WARC-Target-URI'] == "https://example.com/test"
  67. assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
  68. response = next(ai)
  69. assert response.rec_type == 'response'
  70. assert response.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
  71. assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
  72. assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/foo'
  73. request = next(ai)
  74. assert request.rec_type == 'request'
  75. assert request.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
  76. assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
  77. response = next(ai)
  78. assert response.rec_type == 'response'
  79. assert response.rec_headers['WARC-Target-URI'] == "https://example.com/bar"
  80. assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
  81. assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/bar'
  82. request = next(ai)
  83. assert request.rec_type == 'request'
  84. with raises(StopIteration):
  85. assert next(ai)
  86. def test_capture_https_proxy_same_session(self):
  87. sesh = requests.session()
  88. with capture_http() as warc_writer:
  89. res = sesh.get("https://example.com/test", proxies=self.proxies, verify=False)
  90. res = sesh.get("https://example.com/foo", proxies=self.proxies, verify=False)
  91. # *will* be captured, as part of same session... (fix this?)
  92. res = sesh.get("https://example.com/skip", proxies=self.proxies, verify=False)
  93. with capture_http(warc_writer):
  94. res = sesh.get("https://example.com/bar", proxies=self.proxies, verify=False)
  95. ai = ArchiveIterator(warc_writer.get_stream())
  96. response = next(ai)
  97. assert response.rec_type == 'response'
  98. assert response.rec_headers['WARC-Target-URI'] == "https://example.com/test"
  99. assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
  100. assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/test'
  101. request = next(ai)
  102. assert request.rec_type == 'request'
  103. assert request.rec_headers['WARC-Target-URI'] == "https://example.com/test"
  104. assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
  105. response = next(ai)
  106. assert response.rec_type == 'response'
  107. assert response.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
  108. assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
  109. assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/foo'
  110. request = next(ai)
  111. assert request.rec_type == 'request'
  112. assert request.rec_headers['WARC-Target-URI'] == "https://example.com/foo"
  113. assert request.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
  114. response = next(ai)
  115. assert response.rec_type == 'response'
  116. assert response.rec_headers['WARC-Target-URI'] == "https://example.com/skip"
  117. assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
  118. assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/skip'
  119. request = next(ai)
  120. assert request.rec_type == 'request'
  121. response = next(ai)
  122. assert response.rec_type == 'response'
  123. assert response.rec_headers['WARC-Target-URI'] == "https://example.com/bar"
  124. assert response.rec_headers['WARC-Proxy-Host'] == 'https://localhost:{0}'.format(self.port)
  125. assert response.content_stream().read().decode('utf-8') == 'Proxied: /https://example.com/bar'
  126. request = next(ai)
  127. assert request.rec_type == 'request'
  128. with raises(StopIteration):
  129. assert next(ai)