123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167 |
- from warcio.capture_http import capture_http
- import threading
- from wsgiref.simple_server import make_server, WSGIServer
- import time
- import requests
- from warcio.archiveiterator import ArchiveIterator
- from pytest import raises
- # ==================================================================
class TestCaptureHttpProxy(object):
    """Check that ``capture_http`` records traffic sent through a proxy.

    A local ``wsgiref`` server wrapped in ``WSGIProxMiddleware`` acts as the
    proxy. Its WSGI app answers every request with ``'Proxied: '`` followed
    by the request's ``PATH_INFO``; the assertions below expect that to be
    ``'/'`` plus the full requested URL.

    Each test then iterates the captured WARC stream and expects, for every
    recorded exchange, a ``response`` record followed by a ``request`` record.
    """

    @classmethod
    def setup_class(cls):
        """Start the proxy server once and publish ``cls.port``/``cls.proxies``.

        This is the xunit-style class-level hook. The original used a
        nose-style ``setup`` method, which recent pytest releases no longer
        call, and which started a fresh server and thread for every test.
        """
        def app(env, start_response):
            # Echo the path so tests can tell the proxy app produced the body.
            result = ('Proxied: ' + env['PATH_INFO']).encode('utf-8')
            headers = [('Content-Length', str(len(result)))]
            start_response('200 OK', headers=headers)
            return iter([result])

        # Imported here rather than at module level, as in the original.
        from wsgiprox.wsgiprox import WSGIProxMiddleware
        wsgiprox = WSGIProxMiddleware(app, '/')

        class NoLogServer(WSGIServer):
            # No-op override: suppress the server's per-request error output.
            def handle_error(self, request, client_address):
                pass

        # Port 0 lets the OS pick a free port; read back the one it chose.
        server = make_server('localhost', 0, wsgiprox, server_class=NoLogServer)
        cls.port = server.socket.getsockname()[1]

        proxy_addr = 'localhost:' + str(cls.port)
        cls.proxies = {'https': proxy_addr, 'http': proxy_addr}

        def run():
            try:
                server.serve_forever()
            except Exception as e:
                print(e)

        # Daemon thread: it must not keep the interpreter alive after the run.
        thread = threading.Thread(target=run)
        thread.daemon = True
        thread.start()
        time.sleep(0.1)

    def _proxy_host(self, url):
        """Return the expected ``WARC-Proxy-Host`` value for ``url``.

        The expected value uses the scheme of the requested URL together
        with the local proxy port.
        """
        scheme = url.split('://', 1)[0]
        return '{0}://localhost:{1}'.format(scheme, self.port)

    def _expect_response(self, ai, url):
        """Consume the next record and assert it is the response for ``url``."""
        record = next(ai)
        assert record.rec_type == 'response'
        assert record.rec_headers['WARC-Target-URI'] == url
        assert record.rec_headers['WARC-Proxy-Host'] == self._proxy_host(url)
        assert record.content_stream().read().decode('utf-8') == 'Proxied: /' + url

    def _expect_request(self, ai, url=None):
        """Consume the next record and assert it is a request record.

        When ``url`` is given, the target URI and proxy host headers are
        checked as well; when omitted only the record type is checked.
        """
        record = next(ai)
        assert record.rec_type == 'request'
        if url is not None:
            assert record.rec_headers['WARC-Target-URI'] == url
            assert record.rec_headers['WARC-Proxy-Host'] == self._proxy_host(url)

    def _expect_end(self, ai):
        """Assert that the record iterator has no further records."""
        with raises(StopIteration):
            next(ai)

    def test_capture_http_proxy(self):
        """A plain-HTTP request via the proxy yields one response/request pair."""
        url = "http://example.com/test"

        with capture_http() as warc_writer:
            requests.get(url, proxies=self.proxies, verify=False)

        ai = ArchiveIterator(warc_writer.get_stream())
        self._expect_response(ai, url)
        self._expect_request(ai, url)
        self._expect_end(ai)

    def test_capture_https_proxy(self):
        """HTTPS requests are captured only while a capture context is active."""
        with capture_http() as warc_writer:
            requests.get("https://example.com/test", proxies=self.proxies, verify=False)
            requests.get("https://example.com/foo", proxies=self.proxies, verify=False)

        # not recording this request
        requests.get("https://example.com/skip", proxies=self.proxies, verify=False)

        # Re-entering with the same writer appends to the existing capture.
        with capture_http(warc_writer):
            requests.get("https://example.com/bar", proxies=self.proxies, verify=False)

        ai = ArchiveIterator(warc_writer.get_stream())

        self._expect_response(ai, "https://example.com/test")
        self._expect_request(ai, "https://example.com/test")

        self._expect_response(ai, "https://example.com/foo")
        self._expect_request(ai, "https://example.com/foo")

        self._expect_response(ai, "https://example.com/bar")
        self._expect_request(ai)

        self._expect_end(ai)

    def test_capture_https_proxy_same_session(self):
        """With one shared session, the request made between contexts is captured too."""
        sesh = requests.session()

        with capture_http() as warc_writer:
            sesh.get("https://example.com/test", proxies=self.proxies, verify=False)
            sesh.get("https://example.com/foo", proxies=self.proxies, verify=False)

        # *will* be captured, as part of same session... (fix this?)
        sesh.get("https://example.com/skip", proxies=self.proxies, verify=False)

        with capture_http(warc_writer):
            sesh.get("https://example.com/bar", proxies=self.proxies, verify=False)

        ai = ArchiveIterator(warc_writer.get_stream())

        self._expect_response(ai, "https://example.com/test")
        self._expect_request(ai, "https://example.com/test")

        self._expect_response(ai, "https://example.com/foo")
        self._expect_request(ai, "https://example.com/foo")

        self._expect_response(ai, "https://example.com/skip")
        self._expect_request(ai)

        self._expect_response(ai, "https://example.com/bar")
        self._expect_request(ai)

        self._expect_end(ai)
|