1import warnings
2from functools import cached_property
3from collections.abc import Set, Sequence, Mapping
4from typing import ClassVar, Optional, Any
5
6from ...payload import Payload
7from ..types import HarEntry, DictLayers, NameValueDict
8from ..layers import FrameMixin, TCPIPMixin, get_protocols, get_har_communication, get_tcp_stream_id, get_community_id
9from ..utils import get_tshark_bytes_from_raw, har_entry_with_common_fields
10
11
[docs]
class Http2Substream(FrameMixin, TCPIPMixin):
    """
    One HTTP2 substream, bundled with the lower layers of the frame that carried it.

    Only the parent layers named in ``KEEP_LAYERS`` are retained; the raw substream mapping is stored untouched.
    """
    KEEP_LAYERS: ClassVar[Set[str]] = {'frame', 'ip', 'ipv6', 'tcp'}

    def __init__(self, raw_http2_substream: Mapping[str, Any], parent_layers: DictLayers):
        """
        :param raw_http2_substream: the raw substream mapping
        :param parent_layers: layers of the enclosing frame (filtered down to ``KEEP_LAYERS``)
        """
        kept: DictLayers = {}
        for name, content in parent_layers.items():
            if name in self.KEEP_LAYERS:
                kept[name] = content
        self.layers: DictLayers = kept
        self.raw_http2_substream = raw_http2_substream

    @property
    def http2_flags(self) -> int:
        """``http2.flags`` parsed with base auto-detection (``'0x0'`` is used when the key is absent)."""
        raw_flags = self.raw_http2_substream.get('http2.flags', '0x0')
        return int(raw_flags, 0)

    @property
    def http2_type(self) -> int:
        """``http2.type`` as an integer, ``-1`` when the key is absent."""
        raw_type = self.raw_http2_substream.get('http2.type', -1)
        return int(raw_type)

    @property
    def raw_headers(self) -> list[dict[str, Any]]:
        """Content of ``http2.header``, always as a list (empty when the key is absent)."""
        found = self.raw_http2_substream.get('http2.header', [])
        # tshark does not wrap a lone header into a list, so we do it here
        as_list = [found] if isinstance(found, dict) else found
        assert isinstance(as_list, list), as_list
        return as_list
43
44
[docs]
class Http2RequestResponse:
    """
    Shared implementation for a HTTP2 request or response.

    Holds the substreams of the message, plus the headers, data, header substreams and data substreams
    returned by ``Http2Helper.get_headers_and_data``.
    """
    # NOTE(review): several members below read `self.headers_map`, which is not defined in the visible
    # part of this class - confirm where it is provided.
    FALLBACK_CONTENT_TYPE: ClassVar[str] = 'application/octet-stream'

    def __init__(self, substreams: Sequence[Http2Substream]):
        self.substreams = substreams
        extracted = Http2Helper.get_headers_and_data(substreams)
        self.headers, self.data, self.headers_streams, self.data_streams = extracted

    def __bool__(self) -> bool:
        # a message without any substream is considered empty
        return True if self.substreams else False

    @property
    def frames_nbs(self) -> Sequence[int]:
        """Frame numbers of the substreams, de-duplicated, in order of first appearance."""
        return list(dict.fromkeys(substream.frame_nb for substream in self.substreams))

    @property
    def timestamp(self) -> float:
        """Timestamp of the first substream."""
        first = self.substreams[0]
        return first.timestamp

    @property
    def src_host(self) -> str:
        first = self.substreams[0]
        return first.src_host

    @property
    def dst_host(self) -> str:
        first = self.substreams[0]
        return first.dst_host

    @property
    def src_ip(self) -> str:
        first = self.substreams[0]
        return first.src_ip

    @property
    def dst_ip(self) -> str:
        first = self.substreams[0]
        return first.dst_ip

    @property
    def src_port(self) -> int:
        first = self.substreams[0]
        return first.src_port

    @property
    def dst_port(self) -> int:
        first = self.substreams[0]
        return first.dst_port

    @property
    def http_version(self) -> str:
        return 'HTTP/2'

    @property
    def header_length(self) -> int:
        """Sum of ``http2.length`` over the header substreams, or ``-1`` for an empty message."""
        # The effective payload sent over network has bytes size `http2.length` <= `http2.headers.length`
        # (because special headers - like `:status` - have predefined codes)
        if not self:
            return -1
        total = 0
        for header_stream in self.headers_streams:
            total += int(header_stream.raw_http2_substream.get('http2.length', 0))
        return total

    @property
    def body_length(self) -> int:
        """
        Sum of ``http2.length`` over the data substreams, or ``-1`` for an empty message.

        This is the number of compressed bytes (if any compression)

        - `http2.length` is also populated for header substreams
        - we do NOT always have the `http2.body.fragments` -> `http2.body.reassembled.length`

        A warning is emitted when this sum differs from the size of the extracted data
        while `content-encoding` is absent or `identity`.
        """
        if not self:
            return -1
        declared_size = 0
        for data_stream in self.data_streams:
            declared_size += int(data_stream.raw_http2_substream.get('http2.length', 0))
        if declared_size != self.data.size:
            if self.headers_map.get('content-encoding', 'identity') == 'identity':
                warnings.warn(
                    f"Content length mismatch despite no compression: "
                    f"declared ({declared_size}) != computed ({self.data.size})"
                    f"\n{self}"
                )
        return declared_size

    @property
    def http_status(self) -> int:
        """Integer value of the `:status` header, ``0`` when absent."""
        status = self.headers_map.get(':status', 0)
        return int(status)

    @property
    def http_method(self) -> str:
        """Value of the `:method` header, empty string when absent."""
        return self.headers_map.get(':method', '')

    @property
    def content_type(self) -> str:
        """`content-type` header (with fallback), or empty string when there is no message or no data."""
        if self and self.data:
            return self.headers_map.get('content-type', self.FALLBACK_CONTENT_TYPE)
        return ''

    def get_duration_ms(self) -> float:
        """Milliseconds between first and last substreams (rounded to 2 digits), ``-1`` for an empty message."""
        if not self:
            return -1
        elapsed = self.substreams[-1].timestamp - self.substreams[0].timestamp
        return round(1000 * elapsed, 2)
149
150
[docs]
class Http2Request(Http2RequestResponse):
    """
    A HTTP2 request: the headers and data sent by the requesting side.

    Unlike the base class, it can not be built from an empty sequence of substreams.
    """
    def __init__(self, substreams: Sequence[Http2Substream]):
        assert substreams, "At least one substream expected for a request"
        super().__init__(substreams)

    @property
    def uri(self) -> str:
        """The single `http2.request.full_uri` shared by all header substreams."""
        distinct_uris = set()
        for header_stream in self.headers_streams:
            distinct_uris.add(header_stream.raw_http2_substream['http2.request.full_uri'])
        assert len(distinct_uris) == 1, distinct_uris
        return next(iter(distinct_uris))

    def __str__(self):
        frames = ','.join(str(frame_nb) for frame_nb in self.frames_nbs)
        summary = f"Request [#{frames}]: {len(self.headers_streams)}h + {len(self.data_streams)}d substreams\n\t"
        details = f"URI: {self.uri}\n\tHeaders: {self.headers_map}\n\tData: {self.data}"
        return summary + details
170
171
[docs]
class Http2Response(Http2RequestResponse):
    """
    A HTTP2 response: the headers and data sent back by the responding side.

    <!> For convenience it may be empty (no substreams), i.e. the response was never received
    """
    def __str__(self):
        frames = ','.join(str(frame_nb) for frame_nb in self.frames_nbs)
        summary = f"Response [#{frames}]: {len(self.headers_streams)}h + {len(self.data_streams)}d substreams\n\t"
        details = f"Headers: {self.headers_map}\n\tData: {self.data}"
        return summary + details
183
184
[docs]
class Http2Stream:
    """
    A complete HTTP2 stream, i.e. an ordered collection of substreams from which one request
    and one response are built.

    The substreams of a stream are laid out as follows:

    .. code-block::

        +-------------------------------------- (tcp stream, http2 stream)
        | Http2SubStream 1    | Request headers (type: 1)
        | Http2SubStream ...  | Request data (type: 0, flags: 0x0) - partial data
        | Http2SubStream 3    | Request data (type: 0, flags: 0x1) - end of stream, contains reassembled data
        | (Http2SubStream 4   | Request trailers (type: 1))
        +--------------------------------------
        | Http2SubStream 5    | Response headers (type: 1)
        | Http2SubStream ...  | Response data (type: 0, flags: 0x0) - partial data
        | Http2SubStream 7    | Response data (type: 0, flags: 0x1) - end of stream, contains reassembled data
        | (Http2SubStream 8   | Response trailers (type: 1))
        +--------------------------------------

    A stream is uniquely identified by the tuple (tcp stream index, http2 stream index).
    """
    def __init__(self, tcp_stream_id: int, http2_stream_id: int, community_id: str):
        """
        Create an (initially empty) HTTP2 stream for the given TCP stream and HTTP2 stream.

        :param tcp_stream_id: the ID of the TCP stream
        :param http2_stream_id: the ID of the HTTP2 stream
        :param community_id: the community ID (i.e. TCP|UDP + ips & ports) for this conversation
        """
        self.tcp_stream_id = tcp_stream_id
        self.http2_stream_id = http2_stream_id
        self.community_id = community_id
        # request & response only exist once `process` has been called
        self.request: Optional[Http2Request] = None
        self.response: Optional[Http2Response] = None
        self.substreams: list[Http2Substream] = []

    @property
    def id(self) -> tuple[int, int]:
        """The (tcp stream ID, http2 stream ID) pair."""
        return self.tcp_stream_id, self.http2_stream_id

    def append(self, raw_http2_substream: Mapping[str, Any], parent_layers: DictLayers) -> None:
        """
        Wrap a raw substream into a `Http2Substream` and add it at the end of this stream.

        :param raw_http2_substream: the raw substream to be added
        :param parent_layers: all layers of the frame containing the substream (a frame can contain multiple substreams)
        """
        wrapped = Http2Substream(raw_http2_substream, parent_layers)
        self.substreams.append(wrapped)

    @property
    def waiting_duration(self) -> float:
        """
        Milliseconds (rounded to 2 digits) between the last request substream and the first response substream.

        It is 0 when there is no response.
        """
        if not self.response:
            return 0
        assert self.request, self.id
        last_of_request = self.request.substreams[-1]
        first_of_response = self.response.substreams[0]
        elapsed = first_of_response.timestamp - last_of_request.timestamp
        return round(1000 * elapsed, 2)

    def har_entry(self) -> Optional[dict[str, Any]]:
        """
        Build the HAR entry of this stream from its request and response objects.

        `process` must have been called beforehand, so that both objects exist.

        :return: the HAR entry for the HTTP2 stream, or None when the request is empty
        """
        assert self.request is not None, self.id
        assert self.response is not None, self.id
        if not self.request:
            assert not self.response, self.id
            return None
        first_stream = self.request.headers_streams[0]
        fields: dict[str, Any] = {'_timestamp': first_stream.timestamp}
        fields['timings'] = {
            'send': self.request.get_duration_ms(),
            'wait': self.waiting_duration,
            'receive': self.response.get_duration_ms(),
        }
        fields['serverIPAddress'] = first_stream.dst_ip
        fields['_communityId'] = self.community_id
        fields['request'] = Http2Helper.to_har(self.request)
        fields['response'] = Http2Helper.to_har(self.response)
        return har_entry_with_common_fields(fields)

    @staticmethod
    def _get_raw_data_one_substream(raw_http2_substream: Mapping[str, Any]) -> Payload:
        """
        Extract the payload of one raw data substream.

        Note:
        - when dealing with a reassembled data substream, `http2.data.data_raw` MAY not contain all data
        - if the payload was compressed, tshark decompresses ALL data for us (even if data is reassembled)
          under `Content-encoded entity body ...` -> `http2.data.data_raw` key, so we check it first
        """
        for key, value in raw_http2_substream.items():
            if not key.lower().startswith('content-encoded entity body '):
                continue
            assert isinstance(value, dict), (key, value)
            if 'http2.data.data_raw' in value:
                return Payload(get_tshark_bytes_from_raw(value.get('http2.data.data_raw')))
            if 'data_raw' in value:  # special case for failed decompression (not observed but as http protocol?!)
                return Payload(get_tshark_bytes_from_raw(value['data_raw']))
            # also happens in special case of empty decompressed payload (observed)
            assert value['http2.data.data'] == '', value
            return Payload(get_tshark_bytes_from_raw(value.get('http2.data.data_raw')))
        # no decompressed body: prefer the reassembled data over the data of this sole substream
        if 'http2.body.fragments' in raw_http2_substream:
            fragments = raw_http2_substream['http2.body.fragments']
            raw_bytes = fragments['http2.body.reassembled.data_raw']
        else:
            raw_bytes = raw_http2_substream.get('http2.data.data_raw')
        return Payload(get_tshark_bytes_from_raw(raw_bytes))

    @classmethod
    def get_raw_data(cls, raw_http2_substreams: Sequence[Mapping[str, Any]]) -> Payload:
        """
        Find the data in the substreams.

        :param raw_http2_substreams: the data substreams to be analyzed
        :return: the raw reassembled data if it exists, otherwise the concatenation of the data of each substream
        """
        # 1) search for the substream(s) holding reassembled data
        reassembled: dict[int, Mapping[str, Any]] = {}
        for position, raw in enumerate(raw_http2_substreams):
            if 'http2.body.fragments' in raw:
                reassembled[position] = raw
        if not reassembled:
            # 2) there is none (which happens): we manually concatenate fragments
            # <!> decompression for overall content is NOT implemented (should not happen?!)
            return Payload.concat(*(cls._get_raw_data_one_substream(raw) for raw in raw_http2_substreams))
        # should be unique and for last data substream (on rare cases: != at end of stream,
        # hence the END_STREAM flag is not checked)
        assert len(reassembled) == 1, reassembled
        position, raw = next(iter(reassembled.items()))
        assert position == len(raw_http2_substreams) - 1, raw_http2_substreams
        return cls._get_raw_data_one_substream(raw)

    def process(self) -> None:
        """
        Split the substreams between request and response, and create both objects.

        The first substream holding the 'http2.request.full_uri' key is a request substream: its source and
        destination (`src_ip_port` / `dst_ip_port`) define the two sides of the conversation.
        Every substream emitted by that source belongs to the request, every substream emitted by that
        destination belongs to the response. The response may end up empty (no substream from the destination).
        """
        assert self.substreams, self.id

        # Find a request frame and its associated endpoints
        request_like = next(
            (candidate for candidate in self.substreams if 'http2.request.full_uri' in candidate.raw_http2_substream),
            None,
        )
        if request_like is None:
            src, dst = None, None
        else:
            src, dst = request_like.src_ip_port, request_like.dst_ip_port
        assert src and dst, self.substreams
        assert src != dst, src

        # Dispatch each substream according to the side that emitted it
        req_substreams: list[Http2Substream] = []
        resp_substreams: list[Http2Substream] = []
        for candidate in self.substreams:
            if candidate.src_ip_port == src:
                req_substreams.append(candidate)
            if candidate.src_ip_port == dst:
                resp_substreams.append(candidate)
        assert len(req_substreams) + len(resp_substreams) == len(self.substreams), self.substreams
        self.request = Http2Request(req_substreams)
        self.response = Http2Response(resp_substreams)  # may be empty

    def __str__(self):
        header = f'TCP Stream: {self.tcp_stream_id}, HTTP2 Stream: {self.http2_stream_id}'
        return '\n'.join([header, str(self.request), str(self.response)])
352
353
[docs]
class Http2Helper:
    """Stateless helpers to classify substreams and to convert HTTP2 messages to HAR."""
    # NOTE(review): `substream_is_header` and `get_headers` are called by `get_headers_and_data`
    # but are not defined in the visible part of this class - confirm where they are provided.

    @staticmethod
    def substream_is_data(substream: Http2Substream) -> bool:
        """Returns whether substream is a data substream (type 0)."""
        return substream.http2_type == 0

    @staticmethod
    def to_har(message: Http2RequestResponse) -> dict[str, Any]:
        """
        Convert the HTTP2 request or response to its HAR representation.

        <!> Some HTTP2 responses are missing: the message may be empty

        :param message: the HTTP2 request or response to be converted
        :return: the HAR entry for the HTTP2 request or response
        """
        entry: dict[str, Any] = {}
        entry['_timestamp'] = message.timestamp if message else None
        entry['_rawFramesNumbers'] = message.frames_nbs
        entry['httpVersion'] = message.http_version
        entry['cookies'] = []
        entry['headers'] = message.headers
        entry['headersSize'] = message.header_length
        entry['bodySize'] = message.body_length
        if message:
            entry['_communication'] = get_har_communication(message)
        if isinstance(message, Http2Request):
            entry.update({
                'method': message.http_method,
                'url': message.uri,
                'queryString': [],
            })
            # the payload is registered only for requests that actually carry data
            if message.data.size:
                message.data.update_har_request(entry, message.content_type)
            return entry
        entry.update({
            'status': message.http_status,
            'statusText': '',
            'redirectURL': '',
        })
        message.data.update_har_response(entry, message.content_type)
        return entry

    @staticmethod
    def get_data(data_substreams: Sequence[Http2Substream]) -> Payload:
        """
        Extract the data from the substreams (precondition: all substreams are data substreams).

        :param data_substreams: the data substreams to be analyzed
        :return: the reassembled data
        """
        raw_substreams = [data_substream.raw_http2_substream for data_substream in data_substreams]
        return Http2Stream.get_raw_data(raw_substreams)

    @classmethod
    def get_headers_and_data(cls, substreams: Sequence[Http2Substream]):
        """
        Sort the substreams between headers substreams and data substreams.

        The substreams are identified by their types:
        - Headers substream: type 1
        - Data substream: type 0
        We ignore the rest of the substreams.

        Note that (flag & 0x01) identify the end of stream, usually it happens for a data-stream
        but it may also happen for a header-stream (trailers in gRPC),
        or even never happen.

        :param substreams: the substreams of a HTTP2 stream
        :return: a tuple (headers, data, headers substreams, data substreams),
            regardless if it is a request or a response
        """
        collected_headers: list[NameValueDict] = []
        header_substreams: list[Http2Substream] = []
        payload_substreams: list[Http2Substream] = []

        for candidate in substreams:
            # headers are parsed right away
            if cls.substream_is_header(candidate):
                header_substreams.append(candidate)
                collected_headers.extend(Http2Helper.get_headers(candidate))
            # data substreams are only registered, their payload is extracted all at once below
            if cls.substream_is_data(candidate):
                payload_substreams.append(candidate)

        # a non-empty message must hold at least one headers substream
        assert not substreams or header_substreams, (len(substreams), payload_substreams)

        payload = Http2Helper.get_data(payload_substreams)
        return collected_headers, payload, header_substreams, payload_substreams
473
474
[docs]
class Http2Traffic:
    """
    Class to represent the HTTP2 traffic. It contains the HTTP2 streams and the parsed traffic data.

    In HTTP/2, frames are the smallest unit of communication.
    Each frame has a specific type and can have associated flags.

    **HTTP/2 frame types and flags:**


    HTTP/2 Frame Types:

    - `DATA (0x0)`: carries arbitrary, variable-length sequences of octets associated with a stream.
    - `HEADERS (0x1)`: used to open a stream and carry a header block fragment.
    - `PRIORITY (0x2)`: specifies the sender-advised priority of a stream.
    - `RST_STREAM (0x3)`: abruptly terminates a stream.
    - `SETTINGS (0x4)`: used to communicate configuration parameters.
    - `PUSH_PROMISE (0x5)`: used to notify the peer endpoint in advance of streams the sender intends to initiate.
    - `PING (0x6)`: used to measure round-trip time and ensure the connection is still active.
    - `GOAWAY (0x7)`: informs the peer to stop creating streams on this connection.
    - `WINDOW_UPDATE (0x8)`: used to implement flow control.
    - `CONTINUATION (0x9)`: used to continue a sequence of header block fragments.

    HTTP/2 Frame Flags:

    - `END_STREAM (0x1)`: indicates that the frame is the last one for the current stream.
    - `END_HEADERS (0x4)`: indicates that the frame contains the entire header block.
    - `PADDED (0x8)`: indicates that the frame contains padding.
    - `PRIORITY (0x20)`: indicates that the frame contains priority information.

    **TCP stream ID and the HTTP/2 stream ID**

    The TCP stream ID identifies a unique TCP connection. Each TCP connection is assigned a unique stream ID,
    which is used to track the packets that belong to that connection.

    Within a single TCP connection, multiple HTTP/2 streams can exist. Each HTTP/2 stream is identified by
    a unique stream ID within the context of that TCP connection. These stream IDs are used to multiplex
    multiple HTTP/2 requests and responses over a single TCP connection.

    The combination of the TCP stream ID and the HTTP/2 stream ID therefore uniquely identifies an HTTP/2 stream
    within the network traffic.
    """
    def __init__(self, traffic: Sequence[DictLayers]):
        """
        Store the traffic and parse it right away.

        :param traffic: the layers of each captured packet
        """
        self.traffic = traffic
        # HTTP2 streams indexed by (tcp stream ID, http2 stream ID), in order of first appearance
        self.stream_pairs: dict[tuple[int, int], Http2Stream] = {}
        self.parse_traffic()

    @staticmethod
    def _hoist_body_fragments(http2_layer_stream: dict[str, Any], stream: dict[str, Any]) -> None:
        """
        Move a reassembled body found at top-level of the HTTP2 layer entry into its nested stream.

        The body must end up under the `http2.body.fragments` key of the stream, because this is the key
        looked up when the data of the substreams is extracted later on.
        Nothing happens when there is no top-level `http2.body.fragments`.

        :param http2_layer_stream: one entry of the `http2` layer (modified in-place)
        :param stream: the `http2.stream` mapping nested in that entry (modified in-place)
        """
        if 'http2.body.fragments' not in http2_layer_stream:
            return
        assert 'http2.body.fragments' not in stream, http2_layer_stream
        stream['http2.body.fragments'] = http2_layer_stream.pop('http2.body.fragments')

    def parse_traffic(self) -> None:
        """
        Parse the traffic and extract the HTTP2 streams. It populates `stream_pairs`, whose keys are
        tuples (TCP stream ID, HTTP2 stream ID).

        Identify each HTTP2 request and its associated HTTP2 response by following these steps:

        1. Iterate through packets: it loops through all packets of the `traffic` object.
        2. Check for HTTP2 protocol: packets whose protocols do not contain `http2` are ignored.
        3. Extract the TCP stream ID and the community ID of the packet.
        4. Handle HTTP2 layer: it ensures the `http2` layer is a list of HTTP2 stream objects.
        5. Process each HTTP2 stream: for each HTTP2 stream in the `http2` layer:

            - extract stream information: it retrieves the stream type and stream ID.
            - filter relevant streams: it ignores streams that are not data (type 0) or headers (type 1).
            - relocate the reassembled body into the stream when it sits at top-level of the layer entry.
            - create or update stream pair: it creates a new `Http2Stream` for `(tcp_stream_id, http2_stream_id)`
              if it does not exist and appends the substream to it.
        6. Process streams: after assembling the HTTP2 streams, it processes each stream to create the request and
           response objects.
        """
        # Assemble the HTTP2 streams
        for layers in self.traffic:
            # Ignore non-http2 packets
            if 'http2' not in get_protocols(layers):
                continue
            tcp_stream_id = get_tcp_stream_id(layers)
            community_id = get_community_id(layers)

            # HTTP2 layer can be a list of streams or a single stream, force a list
            http2_layer = layers['http2']
            if not isinstance(http2_layer, list):
                http2_layer = [http2_layer]

            for http2_layer_stream in http2_layer:
                stream = http2_layer_stream['http2.stream']
                assert isinstance(stream, dict), type(stream)
                # Ignore streams that are not data or headers
                if int(stream.get('http2.type', -1)) not in {0, 1}:
                    continue
                # <!> Edge-case: reassembled body is at top-level instead of nested in its stream
                self._hoist_body_fragments(http2_layer_stream, stream)
                # Create the HTTP2 stream for (tcp_stream_id, http2_stream_id) if it does not exist
                sid = (tcp_stream_id, int(stream['http2.streamid']))
                http2_stream = self.stream_pairs.get(sid)
                if http2_stream is None:
                    http2_stream = self.stream_pairs[sid] = Http2Stream(*sid, community_id=community_id)
                else:
                    # all packets of a given stream are expected to belong to the same conversation
                    assert community_id == http2_stream.community_id, (community_id, http2_stream.community_id)
                # Append the substream to the list
                http2_stream.append(stream, layers)

        # Process the streams, once for all
        for http2_stream in self.stream_pairs.values():
            http2_stream.process()

    def get_http2_streams(self) -> list[Http2Stream]:
        """
        :return: the HTTP2 streams, in order of first appearance in the traffic
        """
        return list(self.stream_pairs.values())

    def get_har_entries(self) -> list[HarEntry]:
        """
        Convert the HTTP2 traffic to HTTP Archive (HAR) format.

        Streams that do not produce any HAR entry are skipped.

        :return: the HTTP2 traffic in HAR format
        """
        return [har_entry for stream in self.get_http2_streams() if (har_entry := stream.har_entry())]