From 11e7f476bd4bbcd6d072fa3659f628ae3a19705d Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Tue, 15 Sep 2015 19:12:15 +0200 Subject: wip --- netlib/utils.py | 162 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 101 insertions(+), 61 deletions(-) (limited to 'netlib/utils.py') diff --git a/netlib/utils.py b/netlib/utils.py index d6774419..fb579cac 100644 --- a/netlib/utils.py +++ b/netlib/utils.py @@ -1,17 +1,17 @@ -from __future__ import (absolute_import, print_function, division) +from __future__ import absolute_import, print_function, division import os.path -import cgi -import urllib -import urlparse -import string import re -import six +import string import unicodedata +import six + +from six.moves import urllib + -def isascii(s): +def isascii(bytes): try: - s.decode("ascii") + bytes.decode("ascii") except ValueError: return False return True @@ -44,8 +44,8 @@ def clean_bin(s, keep_spacing=True): else: keep = b"" return b"".join( - ch if (31 < ord(ch) < 127 or ch in keep) else b"." - for ch in s + six.int2byte(ch) if (31 < ch < 127 or ch in keep) else b"." + for ch in six.iterbytes(s) ) @@ -149,10 +149,7 @@ class Data(object): return fullpath -def is_valid_port(port): - if not 0 <= port <= 65535: - return False - return True +_label_valid = re.compile(b"(?!-)[A-Z\d-]{1,63}(? 255: + return False + if host[-1] == ".": + host = host[:-1] + return all(_label_valid.match(x) for x in host.split(b".")) + + +def is_valid_port(port): + return 0 <= port <= 65535 + + +# PY2 workaround +def decode_parse_result(result, enc): + if hasattr(result, "decode"): + return result.decode(enc) + else: + return urllib.parse.ParseResult(*[x.decode(enc) for x in result]) + + +# PY2 workaround +def encode_parse_result(result, enc): + if hasattr(result, "encode"): + return result.encode(enc) + else: + return urllib.parse.ParseResult(*[x.encode(enc) for x in result]) def parse_url(url): """ - Returns a (scheme, host, port, path) tuple, or None on error. + URL-parsing function that checks that + - port is an integer 0-65535 + - host is a valid IDNA-encoded hostname with no null-bytes + - path is valid ASCII - Checks that: - port is an integer 0-65535 - host is a valid IDNA-encoded hostname with no null-bytes - path is valid ASCII + Args: + A URL (as bytes or as unicode) + + Returns: + A (scheme, host, port, path) tuple + + Raises: + ValueError, if the URL is not properly formatted. """ - try: - scheme, netloc, path, params, query, fragment = urlparse.urlparse(url) - except ValueError: - return None - if not scheme: - return None - if '@' in netloc: - # FIXME: Consider what to do with the discarded credentials here Most - # probably we should extend the signature to return these as a separate - # value. - _, netloc = string.rsplit(netloc, '@', maxsplit=1) - if ':' in netloc: - host, port = string.rsplit(netloc, ':', maxsplit=1) - try: - port = int(port) - except ValueError: - return None + parsed = urllib.parse.urlparse(url) + + if not parsed.hostname: + raise ValueError("No hostname given") + + if isinstance(url, six.binary_type): + host = parsed.hostname + + # this should not raise a ValueError + decode_parse_result(parsed, "ascii") else: - host = netloc - if scheme.endswith("https"): - port = 443 - else: - port = 80 - path = urlparse.urlunparse(('', '', path, params, query, fragment)) - if not path.startswith("/"): - path = "/" + path + host = parsed.hostname.encode("idna") + parsed = encode_parse_result(parsed, "ascii") + + port = parsed.port + if not port: + port = 443 if parsed.scheme == b"https" else 80 + + full_path = urllib.parse.urlunparse( + (b"", b"", parsed.path, parsed.params, parsed.query, parsed.fragment) + ) + if not full_path.startswith(b"/"): + full_path = b"/" + full_path + if not is_valid_host(host): - return None - if not isascii(path): - return None + raise ValueError("Invalid Host") if not is_valid_port(port): - return None - return scheme, host, port, path + raise ValueError("Invalid Port") + + return parsed.scheme, host, port, full_path def get_header_tokens(headers, key): @@ -217,7 +240,7 @@ def get_header_tokens(headers, key): """ if key not in headers: return [] - tokens = headers[key].split(",") + tokens = headers[key].split(b",") return [token.strip() for token in tokens] @@ -228,7 +251,7 @@ def hostport(scheme, host, port): if (port, scheme) in [(80, "http"), (443, "https")]: return host else: - return "%s:%s" % (host, port) + return b"%s:%s" % (host, port) def unparse_url(scheme, host, port, path=""): @@ -243,14 +266,14 @@ def urlencode(s): Takes a list of (key, value) tuples and returns a urlencoded string. """ s = [tuple(i) for i in s] - return urllib.urlencode(s, False) + return urllib.parse.urlencode(s, False) def urldecode(s): """ Takes a urlencoded string and returns a list of (key, value) tuples. """ - return cgi.parse_qsl(s, keep_blank_values=True) + return urllib.parse.parse_qsl(s, keep_blank_values=True) def parse_content_type(c): @@ -267,14 +290,14 @@ def parse_content_type(c): ("text", "html", {"charset": "UTF-8"}) """ - parts = c.split(";", 1) - ts = parts[0].split("/", 1) + parts = c.split(b";", 1) + ts = parts[0].split(b"/", 1) if len(ts) != 2: return None d = {} if len(parts) == 2: - for i in parts[1].split(";"): - clause = i.split("=", 1) + for i in parts[1].split(b";"): + clause = i.split(b"=", 1) if len(clause) == 2: d[clause[0].strip()] = clause[1].strip() return ts[0].lower(), ts[1].lower(), d @@ -289,7 +312,7 @@ def multipartdecode(headers, content): v = parse_content_type(v) if not v: return [] - boundary = v[2].get("boundary") + boundary = v[2].get(b"boundary") if not boundary: return [] @@ -306,3 +329,20 @@ def multipartdecode(headers, content): r.append((key, value)) return r return [] + + +def always_bytes(unicode_or_bytes, encoding): + if isinstance(unicode_or_bytes, six.text_type): + return unicode_or_bytes.encode(encoding) + return unicode_or_bytes + + +def always_byte_args(encoding): + """Decorator that transparently encodes all arguments passed as unicode""" + def decorator(fun): + def _fun(*args, **kwargs): + args = [always_bytes(arg, encoding) for arg in args] + kwargs = {k: always_bytes(v, encoding) for k, v in six.iteritems(kwargs)} + return fun(*args, **kwargs) + return _fun + return decorator -- cgit v1.2.3 From a077d8877d210562f703c23e9625e8467c81222d Mon Sep 17 00:00:00 2001 From: Maximilian Hils Date: Wed, 16 Sep 2015 00:04:23 +0200 Subject: finish netlib.http.http1 refactor --- netlib/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'netlib/utils.py') diff --git a/netlib/utils.py b/netlib/utils.py index fb579cac..a86b8019 100644 --- a/netlib/utils.py +++ b/netlib/utils.py @@ -40,9 +40,9 @@ def clean_bin(s, keep_spacing=True): ) else: if keep_spacing: - keep = b"\n\r\t" + keep = (9, 10, 13) # \t, \n, \r, else: - keep = b"" + keep = () return b"".join( six.int2byte(ch) if (31 < ch < 127 or ch in keep) else b"." for ch in six.iterbytes(s) @@ -251,7 +251,7 @@ def hostport(scheme, host, port): if (port, scheme) in [(80, "http"), (443, "https")]: return host else: - return b"%s:%s" % (host, port) + return b"%s:%d" % (host, port) def unparse_url(scheme, host, port, path=""): -- cgit v1.2.3