diff options
author | Thomas Kriechbaumer <thomas@kriechbaumer.name> | 2015-08-01 10:39:14 +0200 |
---|---|---|
committer | Thomas Kriechbaumer <thomas@kriechbaumer.name> | 2015-08-01 12:40:40 +0200 |
commit | a837230320378d629ba9f25960b1dfd25c892ad9 (patch) | |
tree | eca444b3965abb294c6304ec41de2fbc307e240f /netlib | |
parent | 199f2a44fed6b5f1c6fada6c96b981dfab5fded2 (diff) | |
download | mitmproxy-a837230320378d629ba9f25960b1dfd25c892ad9.tar.gz mitmproxy-a837230320378d629ba9f25960b1dfd25c892ad9.tar.bz2 mitmproxy-a837230320378d629ba9f25960b1dfd25c892ad9.zip |
move code from mitmproxy to netlib
Diffstat (limited to 'netlib')
-rw-r--r-- | netlib/encoding.py | 82 | ||||
-rw-r--r-- | netlib/http/exceptions.py | 13 | ||||
-rw-r--r-- | netlib/http/http1/protocol.py | 39 | ||||
-rw-r--r-- | netlib/http/semantics.py | 366 | ||||
-rw-r--r-- | netlib/tutils.py | 125 | ||||
-rw-r--r-- | netlib/utils.py | 100 |
6 files changed, 616 insertions, 109 deletions
diff --git a/netlib/encoding.py b/netlib/encoding.py new file mode 100644 index 00000000..f107eb5f --- /dev/null +++ b/netlib/encoding.py @@ -0,0 +1,82 @@ +""" + Utility functions for decoding response bodies. +""" +from __future__ import absolute_import +import cStringIO +import gzip +import zlib + +__ALL__ = ["ENCODINGS"] + +ENCODINGS = set(["identity", "gzip", "deflate"]) + + +def decode(e, content): + encoding_map = { + "identity": identity, + "gzip": decode_gzip, + "deflate": decode_deflate, + } + if e not in encoding_map: + return None + return encoding_map[e](content) + + +def encode(e, content): + encoding_map = { + "identity": identity, + "gzip": encode_gzip, + "deflate": encode_deflate, + } + if e not in encoding_map: + return None + return encoding_map[e](content) + + +def identity(content): + """ + Returns content unchanged. Identity is the default value of + Accept-Encoding headers. + """ + return content + + +def decode_gzip(content): + gfile = gzip.GzipFile(fileobj=cStringIO.StringIO(content)) + try: + return gfile.read() + except (IOError, EOFError): + return None + + +def encode_gzip(content): + s = cStringIO.StringIO() + gf = gzip.GzipFile(fileobj=s, mode='wb') + gf.write(content) + gf.close() + return s.getvalue() + + +def decode_deflate(content): + """ + Returns decompressed data for DEFLATE. Some servers may respond with + compressed data without a zlib header or checksum. An undocumented + feature of zlib permits the lenient decompression of data missing both + values. + + http://bugs.python.org/issue5784 + """ + try: + try: + return zlib.decompress(content) + except zlib.error: + return zlib.decompress(content, -15) + except zlib.error: + return None + + +def encode_deflate(content): + """ + Returns compressed content, always including zlib header and checksum. + """ + return zlib.compress(content) diff --git a/netlib/http/exceptions.py b/netlib/http/exceptions.py index 8a2bbebc..45bd2dce 100644 --- a/netlib/http/exceptions.py +++ b/netlib/http/exceptions.py @@ -7,3 +7,16 @@ class HttpError(Exception): class HttpErrorConnClosed(HttpError): pass + + + +class HttpAuthenticationError(Exception): + def __init__(self, auth_headers=None): + super(HttpAuthenticationError, self).__init__( + "Proxy Authentication Required" + ) + self.headers = auth_headers + self.code = 407 + + def __repr__(self): + return "Proxy Authentication Required" diff --git a/netlib/http/http1/protocol.py b/netlib/http/http1/protocol.py index b098110a..a189bffc 100644 --- a/netlib/http/http1/protocol.py +++ b/netlib/http/http1/protocol.py @@ -375,7 +375,7 @@ class HTTP1Protocol(semantics.ProtocolMixin): @classmethod def has_chunked_encoding(self, headers): return "chunked" in [ - i.lower() for i in http.get_header_tokens(headers, "transfer-encoding") + i.lower() for i in utils.get_header_tokens(headers, "transfer-encoding") ] @@ -482,9 +482,9 @@ class HTTP1Protocol(semantics.ProtocolMixin): port = int(port) except ValueError: return None - if not http.is_valid_port(port): + if not utils.is_valid_port(port): return None - if not http.is_valid_host(host): + if not utils.is_valid_host(host): return None return host, port, httpversion @@ -496,7 +496,7 @@ class HTTP1Protocol(semantics.ProtocolMixin): return None method, url, httpversion = v - parts = http.parse_url(url) + parts = utils.parse_url(url) if not parts: return None scheme, host, port, path = parts @@ -528,7 +528,7 @@ class HTTP1Protocol(semantics.ProtocolMixin): """ # At first, check if we have an explicit Connection header. if "connection" in headers: - toks = http.get_header_tokens(headers, "connection") + toks = utils.get_header_tokens(headers, "connection") if "close" in toks: return True elif "keep-alive" in toks: @@ -556,34 +556,7 @@ class HTTP1Protocol(semantics.ProtocolMixin): @classmethod def _assemble_request_first_line(self, request): - if request.form_in == "relative": - request_line = '%s %s HTTP/%s.%s' % ( - request.method, - request.path, - request.httpversion[0], - request.httpversion[1], - ) - elif request.form_in == "authority": - request_line = '%s %s:%s HTTP/%s.%s' % ( - request.method, - request.host, - request.port, - request.httpversion[0], - request.httpversion[1], - ) - elif request.form_in == "absolute": - request_line = '%s %s://%s:%s%s HTTP/%s.%s' % ( - request.method, - request.scheme, - request.host, - request.port, - request.path, - request.httpversion[0], - request.httpversion[1], - ) - else: - raise http.HttpError(400, "Invalid request form") - return request_line + return request.legacy_first_line() def _assemble_request_headers(self, request): headers = request.headers.copy() diff --git a/netlib/http/semantics.py b/netlib/http/semantics.py index 54bf83d2..e7ae2b5f 100644 --- a/netlib/http/semantics.py +++ b/netlib/http/semantics.py @@ -3,9 +3,15 @@ import binascii import collections import string import sys +import urllib import urlparse from .. import utils, odict +from . import cookies +from netlib import utils, encoding + +HDR_FORM_URLENCODED = "application/x-www-form-urlencoded" +HDR_FORM_MULTIPART = "multipart/form-data" CONTENT_MISSING = 0 @@ -75,7 +81,240 @@ class Request(object): return False def __repr__(self): - return "Request(%s - %s, %s)" % (self.method, self.host, self.path) + # return "Request(%s - %s, %s)" % (self.method, self.host, self.path) + + return "<HTTPRequest: {0}>".format( + self.legacy_first_line()[:-9] + ) + + def legacy_first_line(self): + if self.form_in == "relative": + return '%s %s HTTP/%s.%s' % ( + self.method, + self.path, + self.httpversion[0], + self.httpversion[1], + ) + elif self.form_in == "authority": + return '%s %s:%s HTTP/%s.%s' % ( + self.method, + self.host, + self.port, + self.httpversion[0], + self.httpversion[1], + ) + elif self.form_in == "absolute": + return '%s %s://%s:%s%s HTTP/%s.%s' % ( + self.method, + self.scheme, + self.host, + self.port, + self.path, + self.httpversion[0], + self.httpversion[1], + ) + else: + raise http.HttpError(400, "Invalid request form") + + def anticache(self): + """ + Modifies this request to remove headers that might produce a cached + response. That is, we remove ETags and If-Modified-Since headers. + """ + delheaders = [ + "if-modified-since", + "if-none-match", + ] + for i in delheaders: + del self.headers[i] + + def anticomp(self): + """ + Modifies this request to remove headers that will compress the + resource's data. + """ + self.headers["accept-encoding"] = ["identity"] + + def constrain_encoding(self): + """ + Limits the permissible Accept-Encoding values, based on what we can + decode appropriately. + """ + if self.headers["accept-encoding"]: + self.headers["accept-encoding"] = [ + ', '.join( + e for e in encoding.ENCODINGS if e in self.headers["accept-encoding"][0])] + + def update_host_header(self): + """ + Update the host header to reflect the current target. + """ + self.headers["Host"] = [self.host] + + def get_form(self): + """ + Retrieves the URL-encoded or multipart form data, returning an ODict object. + Returns an empty ODict if there is no data or the content-type + indicates non-form data. + """ + if self.body: + if self.headers.in_any("content-type", HDR_FORM_URLENCODED, True): + return self.get_form_urlencoded() + elif self.headers.in_any("content-type", HDR_FORM_MULTIPART, True): + return self.get_form_multipart() + return odict.ODict([]) + + def get_form_urlencoded(self): + """ + Retrieves the URL-encoded form data, returning an ODict object. + Returns an empty ODict if there is no data or the content-type + indicates non-form data. + """ + if self.body and self.headers.in_any( + "content-type", + HDR_FORM_URLENCODED, + True): + return odict.ODict(utils.urldecode(self.body)) + return odict.ODict([]) + + def get_form_multipart(self): + if self.body and self.headers.in_any( + "content-type", + HDR_FORM_MULTIPART, + True): + return odict.ODict( + utils.multipartdecode( + self.headers, + self.body)) + return odict.ODict([]) + + def set_form_urlencoded(self, odict): + """ + Sets the body to the URL-encoded form data, and adds the + appropriate content-type header. Note that this will destory the + existing body if there is one. + """ + # FIXME: If there's an existing content-type header indicating a + # url-encoded form, leave it alone. + self.headers["Content-Type"] = [HDR_FORM_URLENCODED] + self.body = utils.urlencode(odict.lst) + + def get_path_components(self): + """ + Returns the path components of the URL as a list of strings. + + Components are unquoted. + """ + _, _, path, _, _, _ = urlparse.urlparse(self.url) + return [urllib.unquote(i) for i in path.split("/") if i] + + def set_path_components(self, lst): + """ + Takes a list of strings, and sets the path component of the URL. + + Components are quoted. + """ + lst = [urllib.quote(i, safe="") for i in lst] + path = "/" + "/".join(lst) + scheme, netloc, _, params, query, fragment = urlparse.urlparse(self.url) + self.url = urlparse.urlunparse( + [scheme, netloc, path, params, query, fragment] + ) + + def get_query(self): + """ + Gets the request query string. Returns an ODict object. + """ + _, _, _, _, query, _ = urlparse.urlparse(self.url) + if query: + return odict.ODict(utils.urldecode(query)) + return odict.ODict([]) + + def set_query(self, odict): + """ + Takes an ODict object, and sets the request query string. + """ + scheme, netloc, path, params, _, fragment = urlparse.urlparse(self.url) + query = utils.urlencode(odict.lst) + self.url = urlparse.urlunparse( + [scheme, netloc, path, params, query, fragment] + ) + + def pretty_host(self, hostheader): + """ + Heuristic to get the host of the request. + + Note that pretty_host() does not always return the TCP destination + of the request, e.g. if an upstream proxy is in place + + If hostheader is set to True, the Host: header will be used as + additional (and preferred) data source. This is handy in + transparent mode, where only the IO of the destination is known, + but not the resolved name. This is disabled by default, as an + attacker may spoof the host header to confuse an analyst. + """ + host = None + if hostheader: + host = self.headers.get_first("host") + if not host: + host = self.host + if host: + try: + return host.encode("idna") + except ValueError: + return host + else: + return None + + def pretty_url(self, hostheader): + if self.form_out == "authority": # upstream proxy mode + return "%s:%s" % (self.pretty_host(hostheader), self.port) + return utils.unparse_url(self.scheme, + self.pretty_host(hostheader), + self.port, + self.path).encode('ascii') + + def get_cookies(self): + """ + Returns a possibly empty netlib.odict.ODict object. + """ + ret = odict.ODict() + for i in self.headers["cookie"]: + ret.extend(cookies.parse_cookie_header(i)) + return ret + + def set_cookies(self, odict): + """ + Takes an netlib.odict.ODict object. Over-writes any existing Cookie + headers. + """ + v = cookies.format_cookie_header(odict) + self.headers["Cookie"] = [v] + + @property + def url(self): + """ + Returns a URL string, constructed from the Request's URL components. + """ + return utils.unparse_url( + self.scheme, + self.host, + self.port, + self.path + ).encode('ascii') + + @url.setter + def url(self, url): + """ + Parses a URL specification, and updates the Request's information + accordingly. + + Returns False if the URL was invalid, True if the request succeeded. + """ + parts = utils.parse_url(url) + if not parts: + raise ValueError("Invalid URL: %s" % url) + self.scheme, self.host, self.port, self.path = parts @property def content(self): @@ -139,7 +378,56 @@ class Response(object): return False def __repr__(self): - return "Response(%s - %s)" % (self.status_code, self.msg) + # return "Response(%s - %s)" % (self.status_code, self.msg) + + if self.body: + size = utils.pretty_size(len(self.body)) + else: + size = "content missing" + return "<HTTPResponse: {status_code} {msg} ({contenttype}, {size})>".format( + status_code=self.status_code, + msg=self.msg, + contenttype=self.headers.get_first( + "content-type", "unknown content type" + ), + size=size + ) + + + def get_cookies(self): + """ + Get the contents of all Set-Cookie headers. + + Returns a possibly empty ODict, where keys are cookie name strings, + and values are [value, attr] lists. Value is a string, and attr is + an ODictCaseless containing cookie attributes. Within attrs, unary + attributes (e.g. HTTPOnly) are indicated by a Null value. + """ + ret = [] + for header in self.headers["set-cookie"]: + v = cookies.parse_set_cookie_header(header) + if v: + name, value, attrs = v + ret.append([name, [value, attrs]]) + return odict.ODict(ret) + + def set_cookies(self, odict): + """ + Set the Set-Cookie headers on this response, over-writing existing + headers. + + Accepts an ODict of the same format as that returned by get_cookies. + """ + values = [] + for i in odict.lst: + values.append( + cookies.format_set_cookie_header( + i[0], + i[1][0], + i[1][1] + ) + ) + self.headers["Set-Cookie"] = values @property def content(self): @@ -160,77 +448,3 @@ class Response(object): def code(self, code): # TODO: remove deprecated setter self.status_code = code - - - -def is_valid_port(port): - if not 0 <= port <= 65535: - return False - return True - - -def is_valid_host(host): - try: - host.decode("idna") - except ValueError: - return False - if "\0" in host: - return None - return True - - -def parse_url(url): - """ - Returns a (scheme, host, port, path) tuple, or None on error. - - Checks that: - port is an integer 0-65535 - host is a valid IDNA-encoded hostname with no null-bytes - path is valid ASCII - """ - try: - scheme, netloc, path, params, query, fragment = urlparse.urlparse(url) - except ValueError: - return None - if not scheme: - return None - if '@' in netloc: - # FIXME: Consider what to do with the discarded credentials here Most - # probably we should extend the signature to return these as a separate - # value. - _, netloc = string.rsplit(netloc, '@', maxsplit=1) - if ':' in netloc: - host, port = string.rsplit(netloc, ':', maxsplit=1) - try: - port = int(port) - except ValueError: - return None - else: - host = netloc - if scheme == "https": - port = 443 - else: - port = 80 - path = urlparse.urlunparse(('', '', path, params, query, fragment)) - if not path.startswith("/"): - path = "/" + path - if not is_valid_host(host): - return None - if not utils.isascii(path): - return None - if not is_valid_port(port): - return None - return scheme, host, port, path - - -def get_header_tokens(headers, key): - """ - Retrieve all tokens for a header key. A number of different headers - follow a pattern where each header line can containe comma-separated - tokens, and headers can be set multiple times. - """ - toks = [] - for i in headers[key]: - for j in i.split(","): - toks.append(j.strip()) - return toks diff --git a/netlib/tutils.py b/netlib/tutils.py new file mode 100644 index 00000000..5018b9e8 --- /dev/null +++ b/netlib/tutils.py @@ -0,0 +1,125 @@ +import cStringIO +import tempfile +import os +import time +import shutil +from contextlib import contextmanager + +from netlib import tcp, utils, odict, http + + +def treader(bytes): + """ + Construct a tcp.Read object from bytes. + """ + fp = cStringIO.StringIO(bytes) + return tcp.Reader(fp) + + +@contextmanager +def tmpdir(*args, **kwargs): + orig_workdir = os.getcwd() + temp_workdir = tempfile.mkdtemp(*args, **kwargs) + os.chdir(temp_workdir) + + yield temp_workdir + + os.chdir(orig_workdir) + shutil.rmtree(temp_workdir) + + +def raises(exc, obj, *args, **kwargs): + """ + Assert that a callable raises a specified exception. + + :exc An exception class or a string. If a class, assert that an + exception of this type is raised. If a string, assert that the string + occurs in the string representation of the exception, based on a + case-insenstivie match. + + :obj A callable object. + + :args Arguments to be passsed to the callable. + + :kwargs Arguments to be passed to the callable. + """ + try: + ret = obj(*args, **kwargs) + except Exception as v: + if isinstance(exc, basestring): + if exc.lower() in str(v).lower(): + return + else: + raise AssertionError( + "Expected %s, but caught %s" % ( + repr(str(exc)), v + ) + ) + else: + if isinstance(v, exc): + return + else: + raise AssertionError( + "Expected %s, but caught %s %s" % ( + exc.__name__, v.__class__.__name__, str(v) + ) + ) + raise AssertionError("No exception raised. Return value: {}".format(ret)) + +test_data = utils.Data(__name__) + + + + +def treq(content="content", scheme="http", host="address", port=22): + """ + @return: libmproxy.protocol.http.HTTPRequest + """ + headers = odict.ODictCaseless() + headers["header"] = ["qvalue"] + req = http.Request( + "relative", + "GET", + scheme, + host, + port, + "/path", + (1, 1), + headers, + content, + None, + None, + ) + return req + + +def treq_absolute(content="content"): + """ + @return: libmproxy.protocol.http.HTTPRequest + """ + r = treq(content) + r.form_in = r.form_out = "absolute" + r.host = "address" + r.port = 22 + r.scheme = "http" + return r + + +def tresp(content="message"): + """ + @return: libmproxy.protocol.http.HTTPResponse + """ + + headers = odict.ODictCaseless() + headers["header_response"] = ["svalue"] + + resp = http.semantics.Response( + (1, 1), + 200, + "OK", + headers, + content, + time.time(), + time.time(), + ) + return resp diff --git a/netlib/utils.py b/netlib/utils.py index 86e33f33..39354605 100644 --- a/netlib/utils.py +++ b/netlib/utils.py @@ -1,5 +1,10 @@ from __future__ import (absolute_import, print_function, division) import os.path +import cgi +import urllib +import urlparse +import string + def isascii(s): try: @@ -131,6 +136,81 @@ class Data(object): return fullpath + + +def is_valid_port(port): + if not 0 <= port <= 65535: + return False + return True + + +def is_valid_host(host): + try: + host.decode("idna") + except ValueError: + return False + if "\0" in host: + return None + return True + + +def parse_url(url): + """ + Returns a (scheme, host, port, path) tuple, or None on error. + + Checks that: + port is an integer 0-65535 + host is a valid IDNA-encoded hostname with no null-bytes + path is valid ASCII + """ + try: + scheme, netloc, path, params, query, fragment = urlparse.urlparse(url) + except ValueError: + return None + if not scheme: + return None + if '@' in netloc: + # FIXME: Consider what to do with the discarded credentials here Most + # probably we should extend the signature to return these as a separate + # value. + _, netloc = string.rsplit(netloc, '@', maxsplit=1) + if ':' in netloc: + host, port = string.rsplit(netloc, ':', maxsplit=1) + try: + port = int(port) + except ValueError: + return None + else: + host = netloc + if scheme == "https": + port = 443 + else: + port = 80 + path = urlparse.urlunparse(('', '', path, params, query, fragment)) + if not path.startswith("/"): + path = "/" + path + if not is_valid_host(host): + return None + if not isascii(path): + return None + if not is_valid_port(port): + return None + return scheme, host, port, path + + +def get_header_tokens(headers, key): + """ + Retrieve all tokens for a header key. A number of different headers + follow a pattern where each header line can containe comma-separated + tokens, and headers can be set multiple times. + """ + toks = [] + for i in headers[key]: + for j in i.split(","): + toks.append(j.strip()) + return toks + + def hostport(scheme, host, port): """ Returns the host component, with a port specifcation if needed. @@ -139,3 +219,23 @@ def hostport(scheme, host, port): return host else: return "%s:%s" % (host, port) + +def unparse_url(scheme, host, port, path=""): + """ + Returns a URL string, constructed from the specified compnents. + """ + return "%s://%s%s" % (scheme, hostport(scheme, host, port), path) + + +def urlencode(s): + """ + Takes a list of (key, value) tuples and returns a urlencoded string. + """ + s = [tuple(i) for i in s] + return urllib.urlencode(s, False) + +def urldecode(s): + """ + Takes a urlencoded string and returns a list of (key, value) tuples. + """ + return cgi.parse_qsl(s, keep_blank_values=True) |