Source code for pcapng_utils.tshark.protocols.http1

  1import logging
  2from http import HTTPMethod
  3from abc import ABC, abstractmethod
  4from functools import cached_property
  5from dataclasses import dataclass
  6from collections import defaultdict
  7from collections.abc import Sequence
  8from typing import ClassVar, Any
  9
 10from ...payload import Payload
 11from ..layers import FrameMixin, TCPIPMixin, CommunityIDMixin, get_protocols, get_layers_mapping, get_har_communication
 12from ..types import HarEntry, DictLayers
 13from ..utils import get_tshark_bytes_from_raw, har_entry_with_common_fields
 14from .websocket import WebSocketConversation, WebSocketMessagesInNetworkFramePossiblyIncomplete, is_websocket_conversation
 15
 16
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# String form of every member of `http.HTTPMethod`;
# `HttpRequest.http_version_method` checks parsed request methods against this set.
HTTP_METHODS = {str(v) for v in HTTPMethod}
 20
 21
 22def _get_raw_headers(http_layer: dict[str, Any], direction: str) -> list[bytes]:
 23    raw_headers = http_layer.get(f"http.{direction}.line_raw")
 24    if not raw_headers:
 25        return []
 26    if isinstance(http_layer[f"http.{direction}.line"], str):  # only 1 header (dirty structure)
 27        raw_headers = [raw_headers]
 28    return [get_tshark_bytes_from_raw(h) for h in raw_headers]
 29
 30
@dataclass(frozen=True)
class HttpRequestResponse(ABC, FrameMixin, TCPIPMixin, CommunityIDMixin):
    """
    Base class for HTTP request and response packets. It wraps the layers data and provides methods to
    access the relevant information.
    """
    layers: DictLayers

    # Content type reported when a payload exists but the `http.content_type` field is missing
    FALLBACK_CONTENT_TYPE: ClassVar[str] = 'application/octet-stream'

    @property
    def http_layer(self) -> dict[str, Any]:
        """
        Get the `http` layer of the wrapped frame.

        :return: the `http` layer, asserted to be a dict
        """
        http_layer = self.layers['http']
        assert isinstance(http_layer, dict), self
        return http_layer

    @property
    @abstractmethod
    def raw_headers(self) -> Sequence[bytes]:
        """
        Get the raw header lines of the message (one `bytes` item per header).
        """
        pass

    @property
    def header_length(self) -> int:
        """
        Get the cumulated size (in bytes) of all raw header lines.
        """
        return len(b''.join(self.raw_headers))

    @property
    def content_type(self) -> str:
        """
        Get the content type of the payload.

        :return: an empty string when the payload is falsy, otherwise the `http.content_type` field
            (last value if there are several), defaulting to `FALLBACK_CONTENT_TYPE`
        """
        if not self.payload:
            return ''
        content_type: str | list[str] = self.http_layer.get('http.content_type', self.FALLBACK_CONTENT_TYPE)
        if isinstance(content_type, list):
            content_type = content_type[-1]  # we take last value when multiple values
        return content_type

    @cached_property
    def payload(self) -> Payload:
        """
        Get the body of the message.

        :return: a `Payload` built from `http.file_data_raw`, or from the `data_raw` entry of the first
            "Content-encoded entity body ..." sub-layer when that field is missing
        """
        raw_data = self.http_layer.get('http.file_data_raw')
        if raw_data is None:
            # handle tshark error during decompression
            for k, v in self.http_layer.items():
                if k.lower().startswith('content-encoded entity body ') and isinstance(v, dict):
                    raw_data = v['data_raw']
                    break
        return Payload(get_tshark_bytes_from_raw(raw_data))

    @property
    def content_length(self) -> int:
        """
        Get the size of the payload (`Payload.size`).
        """
        return self.payload.size

    @cached_property
    def headers(self) -> list[dict[str, str]]:
        """
        Get the headers as a list of `{'name': ..., 'value': ...}` dicts (HAR layout).

        Header lines are decoded as UTF-8; lines that are not valid UTF-8 are decoded as ISO-8859-1
        instead of aborting the whole conversion.

        :return: one dict per raw header line, name and value being stripped of surrounding whitespace
        """
        raw_headers = self.raw_headers  # not cached by subclasses: read it only once
        assert isinstance(raw_headers, list), raw_headers
        processed_headers = []
        for header in raw_headers:
            try:
                text = header.decode()
            except UnicodeDecodeError:
                # legacy field values may carry ISO-8859-1 bytes; latin-1 can decode any byte sequence
                text = header.decode('latin-1')
            key_value = text.split(':', 1)  # on rare occasions there is no space after colon
            assert len(key_value) == 2, key_value
            key, value = key_value
            processed_headers.append({
                'name': key.strip(),
                'value': value.strip(),
            })
        return processed_headers

    @property
    def common_har_props(self) -> dict[str, Any]:
        """
        Get the HAR properties shared by requests and responses.
        """
        return {
            'cookies': [],  # TODO?
            'headers': self.headers,
            'headersSize': self.header_length,
            'bodySize': self.content_length,
            '_timestamp': self.timestamp,
            '_rawFramesNumbers': [self.frame_nb],  # always 1 frame in HTTP1
            '_communication': get_har_communication(self),
        }
105 106
@dataclass(frozen=True)
class HttpRequest(HttpRequestResponse):
    """
    An HTTP/1.x request frame.
    """
    @property
    def raw_headers(self) -> list[bytes]:
        """
        Raw header lines of the request, as returned by `_get_raw_headers`.
        """
        return _get_raw_headers(self.http_layer, 'request')

    @property
    def response_frame_nb(self) -> int:
        """
        Number of the frame holding the response, read from `http.response_in`.

        :raises KeyError: when the field is absent from the `http` layer
        """
        response_in = self.http_layer['http.response_in']
        return int(response_in)

    @cached_property
    def http_version_method(self) -> tuple[str, str]:
        """
        Retrieve the HTTP version & method of the request.

        They are read from the first dict-valued entry of the `http` layer that has a
        `http.request.version` key.

        :return: tuple (HTTP version, method), or `('HTTP/1.1', '')` when no such entry exists
        """
        candidates = (
            sub_layer
            for sub_layer in self.http_layer.values()
            if isinstance(sub_layer, dict) and 'http.request.version' in sub_layer
        )
        request_line = next(candidates, None)
        if request_line is None:
            return 'HTTP/1.1', ''
        http_version = request_line['http.request.version']
        assert http_version.startswith('HTTP/1.'), http_version
        method = request_line['http.request.method']
        assert method in HTTP_METHODS, method
        return http_version, method

    @property
    def sending_duration(self) -> float:
        """
        `frame.time_delta` of the frame (0 when absent) multiplied by 1000 and rounded to 2 decimals.
        """
        time_delta = float(self.layers['frame'].get('frame.time_delta', 0))
        return round(1000 * time_delta, 2)

    def to_har(self) -> dict[str, Any]:
        """
        Build the HAR (HTTP Archive) representation of the request.

        :return: the HTTP request in HAR format
        """
        http_version, method = self.http_version_method
        har_request = {
            'method': method,
            'url': self.uri,
            'queryString': [],
            'httpVersion': http_version,
        }
        har_request.update(self.common_har_props)
        # body-related fields are only filled in when there is a body
        if self.content_length:
            self.payload.update_har_request(har_request, self.content_type)
        return har_request

    @property
    def uri(self) -> str:
        """
        Value of the `http.request.full_uri` field.
        """
        return self.http_layer['http.request.full_uri']
160 161
@dataclass(frozen=True)
class HttpResponse(HttpRequestResponse):
    """
    Class to represent an HTTP response.
    """
    @property
    def raw_headers(self) -> list[bytes]:
        """
        Get the raw header lines of the response, as returned by `_get_raw_headers`.
        """
        return _get_raw_headers(self.http_layer, 'response')

    @property
    def request_frame_nb(self) -> int:
        """
        Get the number of the frame holding the request, read from `http.request_in`.

        :raises KeyError: when the field is absent from the `http` layer
        """
        return int(self.http_layer['http.request_in'])

    @cached_property
    def http_version_status_code_message(self) -> tuple[str, int, str]:
        """
        Retrieve the HTTP version & status code & message.

        They are read from the first dict-valued entry of the `http` layer that has a
        `http.response.version` key.

        :return: tuple with HTTP version, status code and message,
            or `('HTTP/1.1', 0, '')` when no such entry exists
        """
        for d in self.http_layer.values():
            if not isinstance(d, dict) or 'http.response.version' not in d:
                continue
            version = d['http.response.version']
            assert version.startswith('HTTP/1.'), version
            return version, int(d['http.response.code']), d['http.response.code.desc']
        return 'HTTP/1.1', 0, ''

    def to_har(self) -> dict[str, Any]:
        """
        Convert the HTTP response to HTTP Archive (HAR) format.

        :return: the HTTP response in HAR format
        """
        http_version, status_code, status_message = self.http_version_status_code_message
        d = {
            'status': status_code,
            'statusText': status_message,
            'redirectURL': '',
            'httpVersion': http_version,
            **self.common_har_props,
        }
        # unlike requests, the payload hook is always called for responses (even with an empty body)
        self.payload.update_har_response(d, self.content_type)
        return d

    @property
    def receiving_duration(self) -> float:
        """
        Get the `http.time` field (0 when absent) multiplied by 1000 and rounded to 2 decimals.
        """
        return round(1000 * float(self.http_layer.get('http.time', 0)), 2)
208 209
class HttpConversation:
    """
    A request/response pair of HTTP/1.x frames.

    When the pair is a websocket handshake, the conversation also carries the websocket conversation
    it opened (`websocket_conversation`), otherwise that attribute is `None`.
    """
    def __init__(self, request: HttpRequest, response: HttpResponse):
        self.request = request
        self.response = response
        if is_websocket_conversation(
            request.http_layer,
            response.http_layer,
            response_code=response.http_version_status_code_message[1],
        ):
            self.websocket_conversation = WebSocketConversation(request.src_dst_ip_port)
        else:
            self.websocket_conversation = None

    @property
    def tcp_stream_id(self) -> int:
        """
        TCP stream identifier of the request (checked against the one of the response when available).
        """
        request_stream_id = self.request.tcp_stream_id
        try:
            assert request_stream_id == self.response.tcp_stream_id, (request_stream_id, self.response.tcp_stream_id)
        except KeyError:
            # buggy/incomplete response may not have `tcp_stream` but OK
            pass
        return request_stream_id

    @property
    def community_id(self) -> str:
        """
        Community ID of the request (checked against the one of the response when available).
        """
        request_community_id = self.request.community_id
        try:
            assert request_community_id == self.response.community_id, (request_community_id, self.response.community_id)
        except KeyError:
            # buggy/incomplete response may not have `community_id` but OK
            pass
        return request_community_id

    @property
    def waiting_duration(self) -> float:
        """
        Difference between response and request timestamps, multiplied by 1000 and rounded to 2 decimals.
        """
        elapsed = self.response.timestamp - self.request.timestamp
        return round(1000 * elapsed, 2)

    def to_har(self) -> dict[str, Any]:
        """
        Build the HAR (HTTP Archive) entry of the conversation.

        :return: the request and the response in HAR format (plus the websocket fields, if any),
            passed through `har_entry_with_common_fields`
        """
        request, response = self.request, self.response
        entry = {
            '_timestamp': request.timestamp,
            'timings': {
                'send': request.sending_duration,
                'wait': self.waiting_duration,
                'receive': response.receiving_duration
            },
            'serverIPAddress': request.dst_ip,
            '_communityId': self.community_id,
            'request': request.to_har(),
            'response': response.to_har(),
        }
        websocket_conversation = self.websocket_conversation
        if websocket_conversation is not None:
            entry.update(websocket_conversation.to_har())
        return har_entry_with_common_fields(entry)


# Tuning of the pairing of orphan requests with orphan responses in `Http1Traffic.parse_traffic`.
# "delta" below is `(request timestamp - response timestamp) * 1000`
# (presumably milliseconds, assuming timestamps are in seconds - TODO confirm).

# Added to the sort key of candidate requests whose delta is positive, so they rank after
# candidates with the same absolute delta that are not positive.
DELTA_MS_ORPHANS_AFTER_PENALTY = 50.0
# A selected match whose delta lies outside this open interval is kept but logged as "Dubious matching".
DELTA_MS_ORPHANS_WINDOW_WARN = (-250.0, 50.0)
# Orphan requests whose delta lies outside this open interval are not considered as candidates at all.
DELTA_MS_ORPHANS_WINDOW_IGNORE = (-2500.0, 500.0)


class Http1Traffic:
    """
    Class to represent HTTP1 network traffic.

    This class is the entry point for parsing HTTP1 network traffic.

    The format of JSON data from tshark is as follows for a single HTTP request:

    - `GET /spi/v2/platforms/ HTTP/1.1\\r\\n`: Contains the HTTP method, URI, and version.
    - `http.request.version`: The HTTP version used.
    - `http.request.line`: A list of HTTP headers sent with the request.
    - `http.host`: The Host header value.
    - `http.request.full_uri`: The full URI including the scheme (e.g., https).
    - `http.request_number`: The request number.
    - `http.response_in`: The response number associated with this request.

    The format of JSON data from tshark is as follows for a single HTTP response:

    - `HTTP/1.1 200 OK\\r\\n`: Contains the HTTP version, status code, and status phrase.
    - `http.content_type`: The Content-Type header value.
    - `http.response.line`: A list of HTTP headers sent with the response.
    - `http.content_encoding`: The Content-Encoding header value.
    - `http.response_number`: The response number.
    - `http.time`: The time taken for the response.
    - `http.request_in`: The request number associated with this response.
    - `http.response_for.uri`: The URI for which this response is generated.
    - `http.file_data_raw`: The data in hexadecimal format (requires -x flag).
    """
    def __init__(self, traffic: Sequence[DictLayers]):
        """
        Store the traffic and parse it right away (see `parse_traffic`).

        :param traffic: the layers of each captured frame
        """
        self.traffic = traffic
        self.conversations: list[HttpConversation] = []
        self.parse_traffic()

    def parse_traffic(self) -> None:
        """
        Parse the HTTP network traffic and extract the request-response pairs.

        Identify each HTTP request and its associated HTTP response by following these steps:

        1. Iterate through packets: It loops through all packets obtained from the `traffic` object.
        2. Check protocols: It checks if the packet contains the `http` protocol by examining the `frame.protocols`
           field.
        3.a. If the traffic corresponds to websocket, try to bind it to the originating HTTP conversation
        3.b. Otherwise, we identify http requests by checking if the packet contains the `http.request`.
        4. Find associated response: If the packet is an HTTP request and contains the `http.response_in` key, it
           retrieves the corresponding response packet using response number and the `layers_mapping`, otherwise
           it will handle it later with orphan responses logic.
        5. Create conversation: It creates an `HttpConversation` object with the request and response packets and
           appends it to the `conversations` list.

        Requests still unpaired at the end are only reported through a warning.
        """
        layers_mapping = get_layers_mapping(
            # discard non-http traffic
            [layers for layers in self.traffic if 'http' in get_protocols(layers)]
        )
        websocket_conversations_per_tcp_stream_id = defaultdict[int, list[WebSocketConversation]](list)
        orphan_requests_per_tcp_stream = defaultdict[int, list[HttpRequest]](list)
        # frame numbers of the responses that are already part of a conversation
        response_nb_blacklist = set[int]()

        for layers in layers_mapping.values():
            if 'websocket' in layers:
                # websocket frames are pushed to the latest websocket conversation of their TCP stream
                ws_frames = WebSocketMessagesInNetworkFramePossiblyIncomplete(layers)  # type: ignore[arg-type]
                ws_convs = websocket_conversations_per_tcp_stream_id[ws_frames.tcp_stream_id]
                assert ws_convs, (ws_frames.tcp_stream_id, ws_frames)
                ws_convs[-1].push(ws_frames)
                continue
            if 'http' not in layers:
                # happens that both 'http' & 'http2' are in `protocols` but only 'http2' in actual layers
                continue
            # we only retain HTTP requests from now on
            request_http_layer: dict[str, Any] | list[dict[str, Any]] = layers['http']
            if isinstance(request_http_layer, list):  # very rare but may happen (simultaneous requests)
                assert all('http.request' in req_http_i for req_http_i in request_http_layer), layers
                # one request per `http` layer, each one wrapping a copy of the frame layers
                requests = [HttpRequest(dict(layers) | {'http': req_http_i}) for req_http_i in request_http_layer]
            elif 'http.request' not in request_http_layer:
                continue
            else:
                requests = [HttpRequest(layers)]
            for i, request in enumerate(requests):
                try:
                    response_nb = request.response_frame_nb
                except KeyError:
                    # no `http.response_in`: pairing is attempted later on with orphan responses
                    orphan_requests_per_tcp_stream[request.tcp_stream_id].append(request)
                    continue
                if response_nb in response_nb_blacklist:
                    assert i > 0, layers  # tshark may get confused when having multiple http layers in same frame (like us)
                    LOGGER.warning(f"Ambiguous response #{response_nb} due to multiple HTTP1 requests in same frame #{request.frame_nb}")
                    orphan_requests_per_tcp_stream[request.tcp_stream_id].append(request)
                    continue
                response_nb_blacklist.add(response_nb)
                http_conversation = HttpConversation(request, HttpResponse(layers_mapping[response_nb]))
                self.conversations.append(http_conversation)
                # handle websocket conversations if needed
                if http_conversation.websocket_conversation is not None:
                    ws_convs_for_cur_tcp_stream = websocket_conversations_per_tcp_stream_id[http_conversation.tcp_stream_id]
                    open_ws_convs_for_cur_tcp_stream = [ws_conv for ws_conv in ws_convs_for_cur_tcp_stream if not ws_conv.is_closed]
                    if open_ws_convs_for_cur_tcp_stream:
                        raise NotImplementedError(
                            "There are still some opened WebSocket conversations "
                            f"for TCP stream #{http_conversation.tcp_stream_id}: {open_ws_convs_for_cur_tcp_stream}"
                        )
                    ws_convs_for_cur_tcp_stream.append(http_conversation.websocket_conversation)

        # try to match orphan responses with orphan requests (esp. for '206 Partial content' responses)
        for response_nb, response_layers in layers_mapping.items():
            response_http_layer = response_layers.get('http')  # NOT a list for responses due to earlier check in requests phase
            if response_nb in response_nb_blacklist or not (response_http_layer and 'http.response' in response_http_layer):
                continue
            response = HttpResponse(response_layers)
            existing_orphan_requests = orphan_requests_per_tcp_stream.get(response.tcp_stream_id, [])
            # candidates: orphan requests of the same TCP stream that are close enough in time,
            # closest first (requests timestamped after the response get an extra penalty)
            possible_requests = sorted([
                (req_ix, req.frame_nb, delta_ms)
                for req_ix, req in enumerate(existing_orphan_requests)
                if DELTA_MS_ORPHANS_WINDOW_IGNORE[0] < (delta_ms := (req.timestamp - response.timestamp) * 1000) < DELTA_MS_ORPHANS_WINDOW_IGNORE[1]
            ], key=lambda tup: abs(tup[-1]) + DELTA_MS_ORPHANS_AFTER_PENALTY*(0 if tup[-1] <= 0 else 1))
            _, resp_status_code, _ = response.http_version_status_code_message
            resp_lbl = f"HTTP1 response (Frame #{response_nb}, TCP stream #{response.tcp_stream_id}, Code {resp_status_code})"
            if not possible_requests:
                # TODO? totally skip pairing for 1xx responses?
                (LOGGER.info if resp_status_code in {100, 102} else LOGGER.warning)(
                    f"Orphan {resp_lbl} did not match with any orphan HTTP1 request"
                )
                continue
            if len(possible_requests) > 1:
                LOGGER.debug(
                    f"Ambiguous matching of orphan {resp_lbl} with possible orphan requests {[f'#{req_nb}' for _, req_nb, _ in possible_requests]}"
                )
            req_ix, req_nb, delta_ms = possible_requests[0]  # first is best (sorted)
            request = existing_orphan_requests.pop(req_ix)  # this request is not orphan anymore
            if not (DELTA_MS_ORPHANS_WINDOW_WARN[0] < delta_ms < DELTA_MS_ORPHANS_WINDOW_WARN[1]):
                LOGGER.warning(f"Dubious matching of orphan {resp_lbl} with orphan request #{req_nb}")

            http_conv = HttpConversation(request, response)
            self.conversations.append(http_conv)

        # log any orphan requests remaining
        for tcp_stream_id, orphan_requests_for_tcp_stream in orphan_requests_per_tcp_stream.items():
            if orphan_requests_for_tcp_stream:
                reqs_lbls = [
                    f"Frame #{req.frame_nb}: {' '.join(req.http_version_method)} {req.uri}"
                    for req in orphan_requests_for_tcp_stream
                ]
                LOGGER.warning(
                    f"TCP stream #{tcp_stream_id}: some orphan HTTP1 requests remain: {reqs_lbls}"
                )

    def get_har_entries(self) -> list[HarEntry]:
        """
        Convert the HTTP network traffic to HTTP Archive (HAR) format.

        :return: the HAR entry of each parsed conversation, in the order of `conversations`
        """
        return [http_conversation.to_har() for http_conversation in self.conversations]