|  |  |  |
|---|---|---|
| author | Maximilian Hils <git@maximilianhils.com> | 2016-02-15 23:00:11 +0100 |
| committer | Maximilian Hils <git@maximilianhils.com> | 2016-02-15 23:00:11 +0100 |
| commit | 87d9afcf2e257eee7c5aa08c3f0dc64da79b0647 (patch) | |
| tree | 71b10729d160f0269d02548d1ef9e183be1397d9 /examples/har_extractor.py | |
| parent | 36f34f701991b5d474c005ec45e3b66e20f326a8 (diff) | |
| parent | 3d9a5157e77b5a3237dc62994f4e3d4c75c2066e (diff) | |
| download | mitmproxy-87d9afcf2e257eee7c5aa08c3f0dc64da79b0647.tar.gz mitmproxy-87d9afcf2e257eee7c5aa08c3f0dc64da79b0647.tar.bz2 mitmproxy-87d9afcf2e257eee7c5aa08c3f0dc64da79b0647.zip | |
Merge pull request #937 from mhils/single-repo
Combine mitmproxy, pathod and netlib in a single repo.
Diffstat (limited to 'examples/har_extractor.py')
|  |  |  |
|---|---|---|
| -rw-r--r-- | examples/har_extractor.py | 253 |

1 file changed, 0 insertions, 253 deletions
```diff
diff --git a/examples/har_extractor.py b/examples/har_extractor.py
deleted file mode 100644
index 4e905438..00000000
--- a/examples/har_extractor.py
+++ /dev/null
@@ -1,253 +0,0 @@
```

```python
"""
    This inline script utilizes harparser.HAR from
    https://github.com/JustusW/harparser to generate a HAR log object.
"""
from datetime import datetime

from harparser import HAR
from pytz import utc  # tzinfo for the HAR startedDateTime fields


class _HARLog(HAR.log):
    # The attributes need to be registered here for them to actually be
    # available later via self. This is due to HAREncodable linking
    # __getattr__ to __getitem__. Anything that is set only in __init__
    # will just be added as a key/value pair to self.__classes__.
    __page_list__ = []
    __page_count__ = 0
    __page_ref__ = {}

    def __init__(self, page_list):
        self.__page_list__ = page_list
        self.__page_count__ = 0
        self.__page_ref__ = {}

        HAR.log.__init__(self, {"version": "1.2",
                                "creator": {"name": "MITMPROXY HARExtractor",
                                            "version": "0.1",
                                            "comment": ""},
                                "pages": [],
                                "entries": []})

    def reset(self):
        self.__init__(self.__page_list__)

    def add(self, obj):
        if isinstance(obj, HAR.pages):
            self['pages'].append(obj)
        if isinstance(obj, HAR.entries):
            self['entries'].append(obj)

    def create_page_id(self):
        self.__page_count__ += 1
        return "autopage_%s" % str(self.__page_count__)

    def set_page_ref(self, page, ref):
        self.__page_ref__[page] = ref

    def get_page_ref(self, page):
        return self.__page_ref__.get(page, None)

    def get_page_list(self):
        return self.__page_list__


def start(context, argv):
    """
    On start we create a HARLog instance. You will have to adapt this to
    suit your actual needs of HAR generation, as it will probably be
    necessary to cluster logs by IP or to reset them from time to time.
    """
    context.dump_file = None
    if len(argv) > 1:
        context.dump_file = argv[1]
    else:
        raise ValueError(
            'Usage: -s "har_extractor.py filename" '
            '(- will output to stdout, filenames ending with .zhar '
            'will result in compressed har)'
        )
    context.HARLog = _HARLog(['https://github.com'])
    context.seen_server = set()


def response(context, flow):
    """
    Called when a server response has been received. At the time of this
    message both a request and a response are present and completely done.
    """
    # Values are converted from float seconds to int milliseconds later.
    ssl_time = -.001
    connect_time = -.001
    if flow.server_conn not in context.seen_server:
        # Calculate the connect_time for this server_conn, then add it to the
        # seen list so the connect_time is not repeated in entries that reuse
        # an existing connection.
        connect_time = (flow.server_conn.timestamp_tcp_setup -
                        flow.server_conn.timestamp_start)
        context.seen_server.add(flow.server_conn)

        if flow.server_conn.timestamp_ssl_setup is not None:
            # Get the ssl_time for this server_conn as the difference between
            # the successful TCP setup and the successful SSL setup. If no SSL
            # setup was made, it is left at -1 since it does not apply to this
            # connection.
            ssl_time = (flow.server_conn.timestamp_ssl_setup -
                        flow.server_conn.timestamp_tcp_setup)

    # Calculate the raw timings from the timestamps present on the request
    # and response objects. DNS timings cannot be measured, so they are not
    # calculated. The same goes for HAR blocked: mitmproxy opens a server
    # connection as soon as it receives the host and port from the client
    # connection, so the time spent blocked is actually spent between
    # request.timestamp_end and response.timestamp_start and therefore
    # correlates with HAR wait instead.
    timings_raw = {
        'send': flow.request.timestamp_end - flow.request.timestamp_start,
        'wait': flow.response.timestamp_start - flow.request.timestamp_end,
        'receive': flow.response.timestamp_end - flow.response.timestamp_start,
        'connect': connect_time,
        'ssl': ssl_time,
    }

    # HAR timings are integers in ms, so we re-encode the raw timings to that
    # format.
    timings = {key: int(1000 * value)
               for key, value in timings_raw.iteritems()}

    # full_time is the sum of all timings. Timings set to -1 are ignored, as
    # per the HAR spec.
    full_time = 0
    for item in timings.values():
        if item > -1:
            full_time += item

    started_date_time = datetime.fromtimestamp(
        flow.request.timestamp_start,
        tz=utc).isoformat()

    request_query_string = [{"name": k, "value": v}
                            for k, v in flow.request.get_query()]
    request_http_version = flow.request.http_version
    # Cookies are shaped as tuples by mitmproxy.
    request_cookies = [{"name": k.strip(), "value": v[0]}
                       for k, v in (flow.request.get_cookies() or {}).iteritems()]
    request_headers = [{"name": k, "value": v}
                       for k, v in flow.request.headers]
    request_headers_size = len(str(flow.request.headers))
    request_body_size = len(flow.request.content)

    response_http_version = flow.response.http_version
    # Cookies are shaped as tuples by mitmproxy.
    response_cookies = [{"name": k.strip(), "value": v[0]}
                        for k, v in (flow.response.get_cookies() or {}).iteritems()]
    response_headers = [{"name": k, "value": v}
                        for k, v in flow.response.headers]
    response_headers_size = len(str(flow.response.headers))
    response_body_size = len(flow.response.content)
    response_body_decoded_size = len(flow.response.get_decoded_content())
    response_body_compression = response_body_decoded_size - response_body_size
    response_mime_type = flow.response.headers.get('Content-Type', '')
    response_redirect_url = flow.response.headers.get('Location', '')

    entry = HAR.entries({
        "startedDateTime": started_date_time,
        "time": full_time,
        "request": {
            "method": flow.request.method,
            "url": flow.request.url,
            "httpVersion": request_http_version,
            "cookies": request_cookies,
            "headers": request_headers,
            "queryString": request_query_string,
            "headersSize": request_headers_size,
            "bodySize": request_body_size,
        },
        "response": {
            "status": flow.response.status_code,
            "statusText": flow.response.msg,
            "httpVersion": response_http_version,
            "cookies": response_cookies,
            "headers": response_headers,
            "content": {
                "size": response_body_size,
                "compression": response_body_compression,
                "mimeType": response_mime_type,
            },
            "redirectURL": response_redirect_url,
            "headersSize": response_headers_size,
            "bodySize": response_body_size,
        },
        "cache": {},
        "timings": timings,
    })

    # If the current url is in the page list of context.HARLog or does not
    # have a referrer, we add it as a new pages object.
    if (flow.request.url in context.HARLog.get_page_list() or
            flow.request.headers.get('Referer', None) is None):
        page_id = context.HARLog.create_page_id()
        context.HARLog.add(
            HAR.pages({
                "startedDateTime": entry['startedDateTime'],
                "id": page_id,
                "title": flow.request.url,
            })
        )
        context.HARLog.set_page_ref(flow.request.url, page_id)
        entry['pageref'] = page_id

    # Look up the referer in the page_ref of context.HARLog to point this
    # entry's pageref attribute at the right pages object, then set it as a
    # new reference to build a reference tree.
    elif context.HARLog.get_page_ref(flow.request.headers.get('Referer')) is not None:
        entry['pageref'] = context.HARLog.get_page_ref(
            flow.request.headers['Referer']
        )
        context.HARLog.set_page_ref(
            flow.request.headers['Referer'], entry['pageref']
        )

    context.HARLog.add(entry)


def done(context):
    """
    Called once on script shutdown, after any other events.
    """
    import json
    import pprint  # the module, not the function: pprint.pformat is needed

    json_dump = context.HARLog.json()
    compressed_json_dump = context.HARLog.compress()

    if context.dump_file == '-':
        context.log(pprint.pformat(json.loads(json_dump)))
    elif context.dump_file.endswith('.zhar'):
        with open(context.dump_file, "w") as f:
            f.write(compressed_json_dump)
    else:
        with open(context.dump_file, "w") as f:
            f.write(json_dump)
    context.log(
        "HAR log finished with %s bytes (%s bytes compressed)" % (
            len(json_dump), len(compressed_json_dump)
        )
    )
    context.log(
        "Compression rate is %s%%" % str(
            100. * len(compressed_json_dump) / len(json_dump)
        )
    )


def print_attributes(obj, filter_string=None, hide_privates=False):
    """
    Useful helper to quickly inspect all attributes of an object and their
    values.
    """
    for attr in dir(obj):
        if hide_privates and "__" in attr:
            continue
        if filter_string is not None and filter_string not in attr:
            continue
        value = getattr(obj, attr)
        # Single format string so this also prints sanely on Python 2.
        print("obj.%s = %r (%s)" % (attr, value, type(value)))
```
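The timings logic in `response` is easier to see with concrete numbers. Below is a minimal, self-contained sketch of the same conversion, using invented deltas: raw float-second values become integer milliseconds, and timings of -1, which mark fields that do not apply (here `connect` and `ssl` on a reused connection), are excluded from the entry's total `time`, as the HAR spec requires.

```python
# Raw deltas in float seconds; -.001 encodes "not applicable" because
# int(1000 * -.001) == -1, the HAR sentinel value. Sample values invented.
timings_raw = {
    'send': 0.250,     # request.timestamp_end - request.timestamp_start
    'wait': 0.500,     # response.timestamp_start - request.timestamp_end
    'receive': 0.125,  # response.timestamp_end - response.timestamp_start
    'connect': -.001,  # reused connection: no TCP setup observed
    'ssl': -.001,      # reused connection: no TLS setup observed
}

# HAR timings are integer milliseconds.
timings = {key: int(1000 * value) for key, value in timings_raw.items()}
# {'send': 250, 'wait': 500, 'receive': 125, 'connect': -1, 'ssl': -1}

# Per the spec, -1 timings are ignored when computing the entry total.
full_time = sum(v for v in timings.values() if v > -1)
assert full_time == 875
```

Seeding `connect_time` and `ssl_time` with -.001 rather than -1 is what lets the blanket 1000x conversion produce exactly the -1 sentinel without a special case.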
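As the `ValueError` raised in `start` indicates, the script expects a single argument through mitmproxy's inline-script mechanism: a filename, `-` to pretty-print the HAR to the event log, or a name ending in `.zhar` for compressed output. With the 0.x command-line syntax this would look something like `mitmdump -s "har_extractor.py dump.har"` (the `mitmdump` spelling is an assumption; any mitmproxy tool that accepts `-s` works the same way).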