Source code for pcapng_utils.tshark.protocols.http2

  1import warnings
  2from functools import cached_property
  3from collections.abc import Set, Sequence, Mapping
  4from typing import ClassVar, Optional, Any
  5
  6from ...payload import Payload
  7from ..types import HarEntry, DictLayers, NameValueDict
  8from ..layers import FrameMixin, TCPIPMixin, get_protocols, get_har_communication, get_tcp_stream_id, get_community_id
  9from ..utils import get_tshark_bytes_from_raw, har_entry_with_common_fields
 10
 11
class Http2Substream(FrameMixin, TCPIPMixin):
    """
    One HTTP2 substream, i.e. a single HTTP2 frame as dissected by tshark.

    It keeps the raw HTTP2 dissection plus the subset of the parent layers listed in `KEEP_LAYERS`.
    """
    KEEP_LAYERS: ClassVar[Set[str]] = {'frame', 'ip', 'ipv6', 'tcp'}

    def __init__(self, raw_http2_substream: Mapping[str, Any], parent_layers: DictLayers):
        """
        :param raw_http2_substream: the raw HTTP2 substream
        :param parent_layers: all layers of the frame containing the substream, only `KEEP_LAYERS` are retained
        """
        kept_layers: DictLayers = {}
        for name, content in parent_layers.items():
            if name in self.KEEP_LAYERS:
                kept_layers[name] = content
        self.layers = kept_layers
        self.raw_http2_substream = raw_http2_substream

    @property
    def http2_flags(self) -> int:
        """Value of `http2.flags` as an integer (base inferred from the prefix), 0 when the field is absent."""
        raw_flags = self.raw_http2_substream.get('http2.flags', '0x0')
        return int(raw_flags, 0)

    @property
    def http2_type(self) -> int:
        """Value of `http2.type` as an integer, -1 when the field is absent."""
        raw_type = self.raw_http2_substream.get('http2.type', -1)
        return int(raw_type)

    @property
    def raw_headers(self) -> list[dict[str, Any]]:
        """Content of `http2.header`, always as a list (empty when the field is absent)."""
        found = self.raw_http2_substream.get('http2.header', [])
        # when there is only 1 header, tshark does not wrap it into a list
        as_list = [found] if isinstance(found, dict) else found
        assert isinstance(as_list, list), as_list
        return as_list
43 44
class Http2RequestResponse:
    """
    Common base of HTTP2 requests and responses.

    It is built from the substreams of one message and exposes its headers, its data
    and the properties shared by requests and responses.

    <!> An instance built from no substream is falsy.
    """
    FALLBACK_CONTENT_TYPE: ClassVar[str] = 'application/octet-stream'

    def __init__(self, substreams: Sequence[Http2Substream]):
        """
        :param substreams: the substreams of the message, in order
        """
        self.substreams = substreams
        parsed = Http2Helper.get_headers_and_data(substreams)
        self.headers, self.data, self.headers_streams, self.data_streams = parsed

    def __bool__(self) -> bool:
        return bool(self.substreams)

    @property
    def _first_substream(self) -> Http2Substream:
        # network-level properties of the message are read from its first substream
        return self.substreams[0]

    @staticmethod
    def _sum_declared_lengths(streams: Sequence[Http2Substream]) -> int:
        # sum of the `http2.length` fields, a missing field counts for 0
        return sum(int(s.raw_http2_substream.get('http2.length', 0)) for s in streams)

    @property
    def frames_nbs(self) -> Sequence[int]:
        """Frames numbers of the substreams, without duplicates and in order of first appearance."""
        return list(dict.fromkeys(s.frame_nb for s in self.substreams))

    @property
    def timestamp(self) -> float:
        return self._first_substream.timestamp

    @property
    def src_host(self) -> str:
        return self._first_substream.src_host

    @property
    def dst_host(self) -> str:
        return self._first_substream.dst_host

    @property
    def src_ip(self) -> str:
        return self._first_substream.src_ip

    @property
    def dst_ip(self) -> str:
        return self._first_substream.dst_ip

    @property
    def src_port(self) -> int:
        return self._first_substream.src_port

    @property
    def dst_port(self) -> int:
        return self._first_substream.dst_port

    @property
    def http_version(self) -> str:
        return 'HTTP/2'

    @property
    def header_length(self) -> int:
        """Sum of `http2.length` over the header substreams, -1 for an empty message."""
        # The effective payload sent over network has bytes size `http2.length` <= `http2.headers.length`
        # (because special headers - like `:status` - have predefined codes)
        if self:
            return self._sum_declared_lengths(self.headers_streams)
        return -1

    @property
    def body_length(self) -> int:
        """
        Sum of `http2.length` over the data substreams, -1 for an empty message.

        This is number of compressed bytes (if any compression)

        - `http2.length` is also populated for header substreams
        - we do NOT always have the `http2.body.fragments` -> `http2.body.reassembled.length`

        A warning is emitted when this size differs from the size of the extracted data
        although the `content-encoding` header is absent or `identity`.
        """
        if not self:
            return -1
        declared_size = self._sum_declared_lengths(self.data_streams)
        if declared_size != self.data.size and self.headers_map.get('content-encoding', 'identity') == 'identity':
            message = (
                f"Content length mismatch despite no compression: "
                f"declared ({declared_size}) != computed ({self.data.size})"
                f"\n{self}"
            )
            warnings.warn(message)
        return declared_size

    @cached_property
    def headers_map(self) -> dict[str, str]:
        """Mapping of lower-cased header name to header value."""
        by_name: dict[str, str] = {}
        for h in self.headers:
            # <!> only last header value is taken into account if there are some collisions
            by_name[h['name'].lower()] = h['value']
        return by_name

    @property
    def http_status(self) -> int:
        """Value of the `:status` header as an integer, 0 when the header is absent."""
        return int(self.headers_map.get(':status', 0))

    @property
    def http_method(self) -> str:
        """Value of the `:method` header, empty string when the header is absent."""
        return self.headers_map.get(':method', '')

    @property
    def content_type(self) -> str:
        """Value of the `content-type` header (with a fallback), empty string when there is no data."""
        if self and self.data:
            return self.headers_map.get('content-type', self.FALLBACK_CONTENT_TYPE)
        return ''

    def get_duration_ms(self) -> float:
        """
        Time elapsed between the first and the last substream.

        :return: the duration in milliseconds rounded to 2 decimals, -1 for an empty message
        """
        if not self:
            return -1
        ended_at = self.substreams[-1].timestamp
        started_at = self.substreams[0].timestamp
        return round(1000 * (ended_at - started_at), 2)
149 150
class Http2Request(Http2RequestResponse):
    """
    A HTTP2 request: headers and data sent by the client. It can not be empty.
    """
    def __init__(self, substreams: Sequence[Http2Substream]):
        """
        :param substreams: the substreams of the request, at least one is expected
        """
        assert substreams, "At least one substream expected for a request"
        super().__init__(substreams)

    @property
    def uri(self) -> str:
        """The `http2.request.full_uri` shared by all the header substreams of the request."""
        uris = set()
        for s in self.headers_streams:
            uris.add(s.raw_http2_substream['http2.request.full_uri'])
        assert len(uris) == 1, uris
        return next(iter(uris))

    def __str__(self):
        frames = ','.join(map(str, self.frames_nbs))
        nb_headers, nb_data = len(self.headers_streams), len(self.data_streams)
        return (
            f"Request [#{frames}]: {nb_headers}h + {nb_data}d substreams\n\t"
            f"URI: {self.uri}\n\tHeaders: {self.headers_map}\n\tData: {self.data}"
        )
170 171
class Http2Response(Http2RequestResponse):
    """
    A HTTP2 response: headers and data sent back by the server.

    <!> May be empty for convenience (response never received)
    """
    def __str__(self):
        frames = ','.join(map(str, self.frames_nbs))
        nb_headers, nb_data = len(self.headers_streams), len(self.data_streams)
        return (
            f"Response [#{frames}]: {nb_headers}h + {nb_data}d substreams\n\t"
            f"Headers: {self.headers_map}\n\tData: {self.data}"
        )
183 184
class Http2Stream:
    """
    An entire HTTP2 stream, made of multiple substreams, holding its request and response objects.

    A single HTTP2 stream can contain multiple substreams as follows:

    .. code-block::

        +-------------------------------------- (tcp stream, http2 stream)
        | Http2SubStream 1   | Request headers (type: 1)
        | Http2SubStream ... | Request data (type: 0, flags: 0x0) - partial data
        | Http2SubStream 3   | Request data (type: 0, flags: 0x1) - end of stream, contains reassembled data
        | (Http2SubStream 4  | Request trailers (type: 1))
        +--------------------------------------
        | Http2SubStream 5   | Response headers (type: 1)
        | Http2SubStream ... | Response data (type: 0, flags: 0x0) - partial data
        | Http2SubStream 7   | Response data (type: 0, flags: 0x1) - end of stream, contains reassembled data
        | (Http2SubStream 8  | Response trailers (type: 1))
        +--------------------------------------

    Each HTTP2 stream is uniquely identified by a tuple (tcp stream index, http2 stream index).
    The request and response objects are only available once `process` has been called.
    """
    def __init__(self, tcp_stream_id: int, http2_stream_id: int, community_id: str):
        """
        Defines a HTTP2 stream for the given TCP stream and HTTP2 stream.

        :param tcp_stream_id: the ID of the TCP stream
        :param http2_stream_id: the ID of the HTTP2 stream
        :param community_id: the community ID (i.e. TCP|UDP + ips & ports) for this conversation
        """
        self.tcp_stream_id = tcp_stream_id
        self.http2_stream_id = http2_stream_id
        self.community_id = community_id
        # both are populated by `process`
        self.request: Optional[Http2Request] = None
        self.response: Optional[Http2Response] = None
        self.substreams: list[Http2Substream] = []

    @property
    def id(self) -> tuple[int, int]:
        """The (tcp stream ID, http2 stream ID) tuple identifying the stream."""
        return (self.tcp_stream_id, self.http2_stream_id)

    def append(self, raw_http2_substream: Mapping[str, Any], parent_layers: DictLayers) -> None:
        """
        Append a new substream to the HTTP2 stream.

        :param raw_http2_substream: the raw substream to be added
        :param parent_layers: all layers of the frame containing the substream (a frame can contain multiple substreams)
        """
        new_substream = Http2Substream(raw_http2_substream, parent_layers)
        self.substreams.append(new_substream)

    @property
    def waiting_duration(self) -> float:
        """
        Time elapsed between the last substream of the request and the first substream of the response,
        in milliseconds rounded to 2 decimals (0 when there is no response).
        """
        if not self.response:
            return 0
        assert self.request, self.id
        request_end = self.request.substreams[-1]
        response_start = self.response.substreams[0]
        elapsed_seconds = response_start.timestamp - request_end.timestamp
        return round(1000 * elapsed_seconds, 2)

    def har_entry(self) -> Optional[dict[str, Any]]:
        """
        Create a HAR entry for the HTTP2 stream. It contains the request and response objects.

        Precondition: `process` was called, so that the request and response objects exist.

        :return: the HAR entry for the HTTP2 stream, or None if the request is empty
        """
        assert self.request is not None, self.id
        assert self.response is not None, self.id
        if not self.request:
            assert not self.response, self.id
            return None
        first_stream = self.request.headers_streams[0]
        started_at = first_stream.timestamp
        timings = {
            'send': self.request.get_duration_ms(),
            'wait': self.waiting_duration,
            'receive': self.response.get_duration_ms(),
        }
        fields = {
            '_timestamp': started_at,
            'timings': timings,
            'serverIPAddress': first_stream.dst_ip,
            '_communityId': self.community_id,
            'request': Http2Helper.to_har(self.request),
            'response': Http2Helper.to_har(self.response),
        }
        return har_entry_with_common_fields(fields)

    @staticmethod
    def _get_raw_data_one_substream(raw_http2_substream: Mapping[str, Any]) -> Payload:
        """
        Extract the payload of a single raw data substream.

        Note:
        - when dealing with a reassembled data substream, `http2.data.data_raw` MAY not contain all data
        - if the payload was compressed, tshark decompresses ALL data for us (even if data is reassembled)
        under `Content-encoded entity body ...` -> `http2.data.data_raw` key, so we check it first
        """
        for key, value in raw_http2_substream.items():
            if not key.lower().startswith('content-encoded entity body '):
                continue
            assert isinstance(value, dict), (key, value)
            if 'http2.data.data_raw' in value:
                return Payload(get_tshark_bytes_from_raw(value.get('http2.data.data_raw')))
            if 'data_raw' in value:  # special case for failed decompression (not observed but as http protocol?!)
                return Payload(get_tshark_bytes_from_raw(value['data_raw']))
            # also happens in special case of empty decompressed payload (observed)
            assert value['http2.data.data'] == '', value
            return Payload(get_tshark_bytes_from_raw(value.get('http2.data.data_raw')))
        if 'http2.body.fragments' in raw_http2_substream:
            fragments = raw_http2_substream['http2.body.fragments']
            return Payload(get_tshark_bytes_from_raw(fragments['http2.body.reassembled.data_raw']))
        return Payload(get_tshark_bytes_from_raw(raw_http2_substream.get('http2.data.data_raw')))

    @classmethod
    def get_raw_data(cls, raw_http2_substreams: Sequence[Mapping[str, Any]]) -> Payload:
        """
        Find the data in the substreams.

        :param raw_http2_substreams: the data substreams to be analyzed
        :return: the reassembled data when a substream holds it, otherwise the concatenation of the data
            of all substreams
        """
        # 1) search for the unique substream with reassembled data if present
        substreams_reassembled = {}
        for ix, raw_http2_substream in enumerate(raw_http2_substreams):
            if 'http2.body.fragments' in raw_http2_substream:
                substreams_reassembled[ix] = raw_http2_substream
        if not substreams_reassembled:
            # 2) if there is none (which happens) we manually concatenate fragments
            # <!> decompression for overall content is NOT implemented (should not happen?!)
            return Payload.concat(*(cls._get_raw_data_one_substream(ss) for ss in raw_http2_substreams))
        # should be unique and for last data substream (on rare cases: != at end of stream)
        assert len(substreams_reassembled) == 1, substreams_reassembled
        ix_reassembled, substream_reassembled = next(iter(substreams_reassembled.items()))
        assert ix_reassembled == len(raw_http2_substreams) - 1, raw_http2_substreams
        return cls._get_raw_data_one_substream(substream_reassembled)

    def process(self) -> None:
        """
        Process the substreams and create the request and response objects accordingly.

        The first substream holding the 'http2.request.full_uri' key is a request substream: its source
        and destination (IP, port) identify the two sides of the conversation. Substreams sent from this source
        belong to the request, substreams sent from this destination belong to the response.

        If no response substream is found, the response object is created empty.
        """
        assert self.substreams, self.id

        # Find a request frame and its associated IPs
        request_marker = next(
            (ss for ss in self.substreams if 'http2.request.full_uri' in ss.raw_http2_substream),
            None,
        )
        if request_marker is None:
            src, dst = None, None
        else:
            src, dst = request_marker.src_ip_port, request_marker.dst_ip_port
        assert src and dst, self.substreams
        assert src != dst, src

        # Create the request and response objects with their associated substreams
        req_substreams: list[Http2Substream] = []
        resp_substreams: list[Http2Substream] = []
        for substream in self.substreams:
            sender = substream.src_ip_port
            if sender == src:
                req_substreams.append(substream)
            if sender == dst:
                resp_substreams.append(substream)
        assert len(req_substreams) + len(resp_substreams) == len(self.substreams), self.substreams
        self.request = Http2Request(req_substreams)
        self.response = Http2Response(resp_substreams)  # may be empty

    def __str__(self):
        lines = (
            f'TCP Stream: {self.tcp_stream_id}, '
            f'HTTP2 Stream: {self.http2_stream_id}',
            f'{self.request}',
            f'{self.response}',
        )
        return '\n'.join(lines)
352 353
class Http2Helper:
    """
    Stateless helpers to classify HTTP2 substreams, extract their headers and data,
    and convert a request or response to its HAR representation.
    """

    @staticmethod
    def substream_is_header(substream: Http2Substream) -> bool:
        """Returns whether substream is a header substream (type 1)."""
        return substream.http2_type == 1

    @staticmethod
    def substream_is_data(substream: Http2Substream) -> bool:
        """Returns whether substream is a data substream (type 0)."""
        return substream.http2_type == 0

    @staticmethod
    def get_headers(substream: Http2Substream) -> list[NameValueDict]:
        """
        Extract the headers from the substream (precondition: it is a header substream).

        Names and values are decoded from their raw bytes and stripped.
        On failure, the faulty raw header is attached as a note to the raised exception.

        :param substream: the substream to be analyzed
        :return: the headers of the substream
        """
        decoded: list[NameValueDict] = []
        for header in substream.raw_headers:
            # cope for non-ASCII headers
            try:
                name_bytes = get_tshark_bytes_from_raw(header['http2.header.name_raw'])
                h_name = name_bytes.decode()
                value_bytes = get_tshark_bytes_from_raw(header.get('http2.header.value_raw'))
                h_value = value_bytes.decode()
                decoded.append({'name': h_name.strip(), 'value': h_value.strip()})
            except Exception as e:
                e.add_note(f"{header=}")
                raise
        return decoded

    @staticmethod
    def to_har(message: Http2RequestResponse) -> dict[str, Any]:
        """
        Convert the HTTP2 request or response to a HAR entry.

        <!> Some HTTP2 responses are missing

        :param message: the HTTP2 request or response to be converted
        :return: the HAR entry for the HTTP2 request or response
        """
        entry: dict[str, Any] = {
            '_timestamp': message.timestamp if message else None,
            '_rawFramesNumbers': message.frames_nbs,
            'httpVersion': message.http_version,
            'cookies': [],
            'headers': message.headers,
            'headersSize': message.header_length,
            'bodySize': message.body_length,
        }
        if message:
            entry['_communication'] = get_har_communication(message)
        if not isinstance(message, Http2Request):
            # response-specific fields
            entry.update({
                'status': message.http_status,
                'statusText': '',
                'redirectURL': '',
            })
            message.data.update_har_response(entry, message.content_type)
            return entry
        # request-specific fields
        entry.update({
            'method': message.http_method,
            'url': message.uri,
            'queryString': [],
        })
        if message.data.size:
            message.data.update_har_request(entry, message.content_type)
        return entry

    @staticmethod
    def get_data(data_substreams: Sequence[Http2Substream]) -> Payload:
        """
        Extract the data from the substreams (precondition: all substreams are data substreams).

        :param data_substreams: the data substreams to be analyzed
        :return: the reassembled data
        """
        raw_substreams = [ss.raw_http2_substream for ss in data_substreams]
        return Http2Stream.get_raw_data(raw_substreams)

    @classmethod
    def get_headers_and_data(cls, substreams: Sequence[Http2Substream]):
        """
        Identify the headers and data substreams and return them.

        The substreams are identified by their types:
        - Headers substream: type 1
        - Data substream: type 0
        We ignore the rest of the substreams.

        Note that (flag & 0x01) identify the end of stream, usually it happens for a data-stream
        but it may also happen for a header-stream (trailers in gRPC),
        or even never happen.

        :param substreams: the substreams of a HTTP2 request or of a HTTP2 response
        :return: a tuple (headers, data, headers substreams, data substreams)
        """
        headers: list[NameValueDict] = []
        headers_streams: list[Http2Substream] = []
        data_streams: list[Http2Substream] = []

        for candidate in substreams:
            # Parse headers (HTTP2 substream marked as headers)
            if cls.substream_is_header(candidate):
                headers_streams.append(candidate)
                headers.extend(Http2Helper.get_headers(candidate))
            # Register data substreams
            if cls.substream_is_data(candidate):
                data_streams.append(candidate)

        # a non-empty message must hold at least one header substream
        if substreams:
            assert headers_streams, (len(substreams), data_streams)

        data = Http2Helper.get_data(data_streams)
        return headers, data, headers_streams, data_streams
473 474
class Http2Traffic:
    """
    Class to represent the HTTP2 traffic. It contains the HTTP2 streams and the parsed traffic data.

    In HTTP/2, frames are the smallest unit of communication.
    Each frame has a specific type and can have associated flags.

    **HTTP/2 frame types and flags:**


    HTTP/2 Frame Types:

    - `DATA (0x0)`: carries arbitrary, variable-length sequences of octets associated with a stream.
    - `HEADERS (0x1)`: used to open a stream and carry a header block fragment.
    - `PRIORITY (0x2)`: specifies the sender-advised priority of a stream.
    - `RST_STREAM (0x3)`: abruptly terminates a stream.
    - `SETTINGS (0x4)`: used to communicate configuration parameters.
    - `PUSH_PROMISE (0x5)`: used to notify the peer endpoint in advance of streams the sender intends to initiate.
    - `PING (0x6)`: used to measure round-trip time and ensure the connection is still active.
    - `GOAWAY (0x7)`: informs the peer to stop creating streams on this connection.
    - `WINDOW_UPDATE (0x8)`: used to implement flow control.
    - `CONTINUATION (0x9)`: used to continue a sequence of header block fragments.

    HTTP/2 Frame Flags:

    - `END_STREAM (0x1)`: indicates that the frame is the last one for the current stream.
    - `END_HEADERS (0x4)`: indicates that the frame contains the entire header block.
    - `PADDED (0x8)`: indicates that the frame contains padding.
    - `PRIORITY (0x20)`: indicates that the frame contains priority information.

    **TCP stream ID and the HTTP/2 stream ID**

    The TCP stream ID identifies a unique TCP connection. Each TCP connection is assigned a unique stream ID,
    which is used to track the packets that belong to that connection.
    Within a single TCP connection, multiple HTTP/2 streams can exist. Each HTTP/2 stream is
    identified by a unique stream ID within the context of that TCP connection. These stream IDs are used to
    multiplex multiple HTTP/2 requests and responses over a single TCP connection.

    A single TCP stream (connection) can contain multiple HTTP/2 streams. Each HTTP/2 stream is
    uniquely identified within the context of its TCP stream. The combination of the TCP stream ID and the
    HTTP/2 stream ID uniquely identifies an HTTP/2 stream within the network traffic.
    """
    def __init__(self, traffic: Sequence[DictLayers]):
        """
        Store the traffic and parse it right away.

        :param traffic: the layers of each captured packet
        """
        self.traffic = traffic
        self.stream_pairs: dict[tuple[int, int], Http2Stream] = {}
        self.parse_traffic()

    def parse_traffic(self) -> None:
        """
        Parse the traffic and extract the HTTP2 streams. It creates a dictionary for each HTTP2 stream.
        Each key is a tuple with the TCP stream ID and the HTTP2 stream ID.

        Identify each HTTP2 request and its associated HTTP2 response by following these steps:

        1. Iterate through packets: it loops through all packets obtained from the `traffic` object.
        2. Extract protocols: for each packet, it extracts the protocols of the packet.
        3. Check for HTTP2 protocol: it checks if the packet contains the `http2` protocol.
        4. Extract the TCP stream ID and the community ID of the packet.
        5. Handle HTTP2 layer: it ensures the `http2` layer is a list of HTTP2 stream objects.
        6. Process each HTTP2 stream: for each HTTP2 stream in the `http2` layer:

            - extract stream information: it retrieves the stream type and stream ID.
            - filter relevant streams: it ignores streams that are not data (type 0) or headers (type 1).
            - move a reassembled body found at top-level into its stream (the packet layers are modified in-place).
            - create or update stream pair: it creates a new tuple of `(tcp_stream_id, http2_stream_id)` if it does not
              exist and appends the substream to the list.
        7. Process streams: after assembling the HTTP2 streams, it processes each stream to create the request and
           response objects.
        """
        # Assemble the HTTP2 streams
        for layers in self.traffic:
            # Ignore non-http2 packets
            if 'http2' not in get_protocols(layers):
                continue
            tcp_stream_id = get_tcp_stream_id(layers)
            community_id = get_community_id(layers)

            # HTTP2 layer can be a list of streams or a single stream, force a list
            http2_layer: list[dict[str, Any]] = layers['http2']
            if not isinstance(http2_layer, list):
                http2_layer = [http2_layer]

            for http2_layer_stream in http2_layer:
                stream = http2_layer_stream['http2.stream']
                assert isinstance(stream, dict), type(stream)
                http2_frame_type = int(stream.get('http2.type', -1))
                # Ignore streams that are not data or headers
                if http2_frame_type not in {0, 1}:
                    continue
                # <!> Edge-case: reassembled body is at top-level instead of nested in its stream.
                # It must be stored under the very key that is looked up when extracting the data
                # of the substreams, otherwise the reassembled body is silently ignored.
                if 'http2.body.fragments' in http2_layer_stream:
                    assert 'http2.body.fragments' not in stream, http2_layer_stream
                    stream['http2.body.fragments'] = http2_layer_stream.pop('http2.body.fragments')
                # Create a new tuple of (tcp_stream_id, http2_stream_id) if it does not exist
                http2_stream_id = int(stream['http2.streamid'])
                sid = (tcp_stream_id, http2_stream_id)
                http2_stream = self.stream_pairs.get(sid)
                if http2_stream is None:
                    http2_stream = self.stream_pairs[sid] = Http2Stream(*sid, community_id=community_id)
                else:
                    # all packets of a given stream must belong to the same conversation
                    assert community_id == http2_stream.community_id, (community_id, http2_stream.community_id)
                # Append the substream to the list
                http2_stream.append(stream, layers)

        # Process the streams, once for all
        for http2_stream in self.stream_pairs.values():
            http2_stream.process()

    def get_http2_streams(self) -> list[Http2Stream]:
        """
        :return: the parsed HTTP2 streams, in order of first appearance in the traffic
        """
        return list(self.stream_pairs.values())

    def get_har_entries(self) -> list[HarEntry]:
        """
        Convert the HTTP2 traffic to HTTP Archive (HAR) format.

        Streams for which no HAR entry can be built are skipped.

        :return: the HTTP2 traffic in HAR format
        """
        return [
            har_entry
            for stream in self.get_http2_streams()
            if (har_entry := stream.har_entry())
        ]