1import logging
2from http import HTTPMethod
3from abc import ABC, abstractmethod
4from functools import cached_property
5from dataclasses import dataclass
6from collections import defaultdict
7from collections.abc import Sequence
8from typing import ClassVar, Any
9
10from ...payload import Payload
11from ..layers import FrameMixin, TCPIPMixin, CommunityIDMixin, get_protocols, get_layers_mapping, get_har_communication
12from ..types import HarEntry, DictLayers
13from ..utils import get_tshark_bytes_from_raw, har_entry_with_common_fields
14from .websocket import WebSocketConversation, WebSocketMessagesInNetworkFramePossiblyIncomplete, is_websocket_conversation
15
16
17LOGGER = logging.getLogger(__name__)
18
19HTTP_METHODS = {str(v) for v in HTTPMethod}
20
21
22def _get_raw_headers(http_layer: dict[str, Any], direction: str) -> list[bytes]:
23 raw_headers = http_layer.get(f"http.{direction}.line_raw")
24 if not raw_headers:
25 return []
26 if isinstance(http_layer[f"http.{direction}.line"], str): # only 1 header (dirty structure)
27 raw_headers = [raw_headers]
28 return [get_tshark_bytes_from_raw(h) for h in raw_headers]
29
30
[docs]
@dataclass(frozen=True)
class HttpRequestResponse(ABC, FrameMixin, TCPIPMixin, CommunityIDMixin):
    """
    Common base of HTTP/1 request and response packets.

    It wraps the layers of a single frame and exposes the HTTP-related pieces
    (raw headers, payload, content type, HAR properties) shared by both directions.
    """
    layers: DictLayers

    # Content type reported when a payload is present but the HTTP layer has no `http.content_type`.
    FALLBACK_CONTENT_TYPE: ClassVar[str] = 'application/octet-stream'

    @property
    def http_layer(self) -> dict[str, Any]:
        """The `http` entry of the wrapped layers (asserted to be a dict)."""
        layer = self.layers['http']
        assert isinstance(layer, dict), self
        return layer

    @property
    @abstractmethod
    def raw_headers(self) -> Sequence[bytes]:
        """Raw header lines of the packet; provided by the concrete subclasses."""

    @property
    def header_length(self) -> int:
        """Total number of bytes of all raw headers."""
        joined = b''.join(self.raw_headers)
        return len(joined)

    @property
    def content_type(self) -> str:
        """
        Content type of the payload.

        :return: an empty string when the payload is falsy, otherwise the value of `http.content_type`
            (its last item when it is a list) or `FALLBACK_CONTENT_TYPE` when absent
        """
        if not self.payload:
            return ''
        value: str | list[str] = self.http_layer.get('http.content_type', self.FALLBACK_CONTENT_TYPE)
        # we take last value when multiple values
        return value[-1] if isinstance(value, list) else value

    @cached_property
    def payload(self) -> Payload:
        """
        Payload built from `http.file_data_raw`.

        When that key is missing, the first dict entry whose key starts (case-insensitively) with
        'content-encoded entity body ' is used instead, to handle tshark errors during decompression.
        """
        http_layer = self.http_layer
        raw_data = http_layer.get('http.file_data_raw')
        if raw_data is None:
            # handle tshark error during decompression
            raw_data = next(
                (
                    value['data_raw']
                    for key, value in http_layer.items()
                    if key.lower().startswith('content-encoded entity body ') and isinstance(value, dict)
                ),
                None,
            )
        return Payload(get_tshark_bytes_from_raw(raw_data))

    @property
    def content_length(self) -> int:
        """Size of the payload."""
        return self.payload.size

    @property
    def common_har_props(self) -> dict[str, Any]:
        """HAR properties shared by requests and responses."""
        # NOTE(review): `headers`, `timestamp` and `frame_nb` are not defined in the visible part of
        # this class; presumably they come from the mixins or from a part of the file not shown -- confirm.
        props: dict[str, Any] = {
            'cookies': [],  # TODO?
            'headers': self.headers,
            'headersSize': self.header_length,
            'bodySize': self.content_length,
            '_timestamp': self.timestamp,
            '_rawFramesNumbers': [self.frame_nb],  # always 1 frame in HTTP1
            '_communication': get_har_communication(self),
        }
        return props
106
[docs]
@dataclass(frozen=True)
class HttpRequest(HttpRequestResponse):
    """
    An HTTP/1 request packet.
    """
    @property
    def raw_headers(self) -> list[bytes]:
        """Raw header lines of the request."""
        return _get_raw_headers(self.http_layer, 'request')

    @property
    def response_frame_nb(self) -> int:
        """Value of `http.response_in` as an integer (`KeyError` when the layer has no such key)."""
        return int(self.http_layer['http.response_in'])

    @cached_property
    def http_version_method(self) -> tuple[str, str]:
        """
        Look up the HTTP version & method in the nested entries of the HTTP layer.

        :return: tuple with HTTP version & method; `('HTTP/1.1', '')` when no nested entry holds them
        """
        for entry in self.http_layer.values():
            if isinstance(entry, dict) and 'http.request.version' in entry:
                http_version = entry['http.request.version']
                assert http_version.startswith('HTTP/1.'), http_version
                method = entry['http.request.method']
                assert method in HTTP_METHODS, method
                return http_version, method
        return 'HTTP/1.1', ''

    @property
    def sending_duration(self) -> float:
        """`frame.time_delta` (0 when absent) scaled by 1000 and rounded to 2 decimals."""
        time_delta = self.layers['frame'].get('frame.time_delta', 0)
        return round(1000 * float(time_delta), 2)

    @property
    def uri(self) -> str:
        """Value of `http.request.full_uri`."""
        return self.http_layer['http.request.full_uri']

    def to_har(self) -> dict[str, Any]:
        """
        Build the HTTP Archive (HAR) representation of this request.

        :return: the HTTP request in HAR format
        """
        http_version, method = self.http_version_method
        har: dict[str, Any] = {
            'method': method,
            'url': self.uri,
            'queryString': [],
            'httpVersion': http_version,
        }
        har.update(self.common_har_props)
        # the payload only contributes to the HAR when the request actually has a body
        if self.content_length:
            self.payload.update_har_request(har, self.content_type)
        return har
161
[docs]
@dataclass(frozen=True)
class HttpResponse(HttpRequestResponse):
    """
    Class to represent an HTTP response.
    """
    @property
    def raw_headers(self) -> list[bytes]:
        """Raw header lines of the response."""
        return _get_raw_headers(self.http_layer, 'response')

    @property
    def request_frame_nb(self) -> int:
        """Value of `http.request_in` as an integer (`KeyError` when the layer has no such key)."""
        return int(self.http_layer['http.request_in'])

    @cached_property
    def http_version_status_code_message(self) -> tuple[str, int, str]:
        """
        Retrieve the HTTP version & status code & message from the nested entries of the HTTP layer.

        :return: tuple with HTTP version, status code and message;
            `('HTTP/1.1', 0, '')` when no nested entry holds them
        """
        for entry in self.http_layer.values():
            if not isinstance(entry, dict) or 'http.response.version' not in entry:
                continue
            version = entry['http.response.version']
            assert version.startswith('HTTP/1.'), version
            return version, int(entry['http.response.code']), entry['http.response.code.desc']
        return 'HTTP/1.1', 0, ''

    def to_har(self) -> dict[str, Any]:
        """
        Convert the HTTP response to HTTP Archive (HAR) format.

        :return: the HTTP response in HAR format
        """
        http_version, status_code, status_message = self.http_version_status_code_message
        har: dict[str, Any] = {
            'status': status_code,
            'statusText': status_message,
            'redirectURL': '',
            'httpVersion': http_version,
            **self.common_har_props,
        }
        # unlike for requests, the payload is always given the chance to update the HAR response
        self.payload.update_har_response(har, self.content_type)
        return har

    @property
    def receiving_duration(self) -> float:
        """`http.time` (0 when absent) scaled by 1000 and rounded to 2 decimals."""
        return round(1000 * float(self.http_layer.get('http.time', 0)), 2)
209
[docs]
class HttpConversation:
    """
    An HTTP conversation, i.e. a request paired with its response.

    When the pair is recognized as a websocket handshake, the conversation also holds
    the corresponding websocket conversation (otherwise `websocket_conversation` is None).
    """
    def __init__(self, request: HttpRequest, response: HttpResponse):
        self.request = request
        self.response = response
        is_handshake = is_websocket_conversation(
            request.http_layer,
            response.http_layer,
            response_code=response.http_version_status_code_message[1],
        )
        self.websocket_conversation = WebSocketConversation(request.src_dst_ip_port) if is_handshake else None

    @property
    def tcp_stream_id(self) -> int:
        """TCP stream id of the request (asserted equal to the response's one when available)."""
        stream_id = self.request.tcp_stream_id
        try:
            assert stream_id == self.response.tcp_stream_id, (stream_id, self.response.tcp_stream_id)
        except KeyError:
            # buggy/incomplete response may not have `tcp_stream` but OK
            pass
        return stream_id

    @property
    def community_id(self) -> str:
        """Community id of the request (asserted equal to the response's one when available)."""
        request_cid = self.request.community_id
        try:
            assert request_cid == self.response.community_id, (request_cid, self.response.community_id)
        except KeyError:
            # buggy/incomplete response may not have `community_id` but OK
            pass
        return request_cid

    @property
    def waiting_duration(self) -> float:
        """Difference between response and request timestamps, scaled by 1000 and rounded to 2 decimals."""
        elapsed = self.response.timestamp - self.request.timestamp
        return round(1000 * elapsed, 2)

    def to_har(self) -> dict[str, Any]:
        """
        Build the HTTP Archive (HAR) entry of this conversation.

        :return: the HTTP conversation (request and response) in HAR format
        """
        entry: dict[str, Any] = {
            '_timestamp': self.request.timestamp,
            'timings': {
                'send': self.request.sending_duration,
                'wait': self.waiting_duration,
                'receive': self.response.receiving_duration
            },
            'serverIPAddress': self.request.dst_ip,
            '_communityId': self.community_id,
            'request': self.request.to_har(),
            'response': self.response.to_har(),
        }
        # websocket data (if any) is merged last, at the top level of the entry
        if self.websocket_conversation is not None:
            entry.update(self.websocket_conversation.to_har())
        return har_entry_with_common_fields(entry)
274
# Tuning (in milliseconds) of the pairing between orphan HTTP1 requests and orphan responses.
# The compared delta is `(request timestamp - response timestamp) * 1000`.

# Extra cost added to the sorting score of a candidate request whose delta is strictly positive.
DELTA_MS_ORPHANS_AFTER_PENALTY: float = 50.0
# The best candidate is still paired when its delta is not strictly within these bounds, but a warning is logged.
DELTA_MS_ORPHANS_WINDOW_WARN: tuple[float, float] = (-250.0, 50.0)
# Candidate requests whose delta is not strictly within these bounds are not considered at all.
DELTA_MS_ORPHANS_WINDOW_IGNORE: tuple[float, float] = (-2500.0, 500.0)
278
279
[docs]
class Http1Traffic:
    """
    Class to represent HTTP1 network traffic.

    This class is the entry point for parsing HTTP1 network traffic.

    The format of JSON data from tshark is as follows for a single HTTP request:

    - `GET /spi/v2/platforms/ HTTP/1.1\\r\\n`: Contains the HTTP method, URI, and version.
    - `http.request.version`: The HTTP version used.
    - `http.request.line`: A list of HTTP headers sent with the request.
    - `http.host`: The Host header value.
    - `http.request.full_uri`: The full URI including the scheme (e.g., https).
    - `http.request_number`: The request number.
    - `http.response_in`: The response number associated with this request.

    The format of JSON data from tshark is as follows for a single HTTP response:

    - `HTTP/1.1 200 OK\\r\\n`: Contains the HTTP version, status code, and status phrase.
    - `http.content_type`: The Content-Type header value.
    - `http.response.line`: A list of HTTP headers sent with the response.
    - `http.content_encoding`: The Content-Encoding header value.
    - `http.response_number`: The response number.
    - `http.time`: The time taken for the response.
    - `http.request_in`: The request number associated with this response.
    - `http.response_for.uri`: The URI for which this response is generated.
    - `http.file_data_raw`: The data in hexadecimal format (requires -x flag).
    """
    def __init__(self, traffic: Sequence[DictLayers]):
        """
        Store the traffic and parse it right away.

        :param traffic: the layers of every captured frame (non-HTTP frames are discarded while parsing)
        """
        self.traffic = traffic
        self.conversations: list[HttpConversation] = []
        self.parse_traffic()

    def parse_traffic(self) -> None:
        """
        Parse the HTTP network traffic and extract the request-response pairs.

        Identify each HTTP request and its associated HTTP response by following these steps:

        1. Iterate through packets: It loops through all packets obtained from the `traffic` object.
        2. Check protocols: It checks if the packet contains the `http` protocol by examining the `frame.protocols`
           field.
        3.a. If the traffic corresponds to websocket, try to bind it to the originating HTTP conversation
        3.b. Otherwise, we identify http requests by checking if the packet contains the `http.request`.
        4. Find associated response: If the packet is an HTTP request and contains the `http.response_in` key, it
           retrieves the corresponding response packet using response number and the `layers_mapping`, otherwise
           it will handle it later with orphan responses logic.
        5. Create conversation: It creates an `HttpConversation` object with the request and response packets and
           appends it to the `conversations` list.

        The conversations found are appended to `self.conversations`.
        """
        layers_mapping = get_layers_mapping(
            # discard non-http traffic
            [layers for layers in self.traffic if 'http' in get_protocols(layers)]
        )
        # state shared by the successive phases below
        orphan_requests_per_tcp_stream = defaultdict[int, list[HttpRequest]](list)
        response_nb_blacklist = set[int]()

        self._pair_requests_with_responses(layers_mapping, orphan_requests_per_tcp_stream, response_nb_blacklist)
        self._match_orphan_responses(layers_mapping, orphan_requests_per_tcp_stream, response_nb_blacklist)
        self._log_remaining_orphan_requests(orphan_requests_per_tcp_stream)

    @staticmethod
    def _requests_in_frame(layers: DictLayers) -> list[HttpRequest]:
        """Build the HTTP request(s) held by one frame having an `http` layer (empty list if it holds no request)."""
        request_http_layer: dict[str, Any] | list[dict[str, Any]] = layers['http']
        if isinstance(request_http_layer, list):  # very rare but may happen (simultaneous requests)
            assert all('http.request' in req_http_i for req_http_i in request_http_layer), layers
            # one request per HTTP layer, each one seeing only its own `http` entry
            return [HttpRequest(dict(layers) | {'http': req_http_i}) for req_http_i in request_http_layer]
        if 'http.request' not in request_http_layer:
            return []
        return [HttpRequest(layers)]

    def _pair_requests_with_responses(
        self,
        layers_mapping: Any,
        orphan_requests_per_tcp_stream: defaultdict[int, list[HttpRequest]],
        response_nb_blacklist: set[int],
    ) -> None:
        """
        Pair every request with the response it designates, and bind websocket frames to their handshake.

        :param layers_mapping: frame number -> layers mapping, as returned by `get_layers_mapping`
        :param orphan_requests_per_tcp_stream: filled with the requests that could not be paired here
        :param response_nb_blacklist: filled with the frame numbers of the responses paired here
        """
        websocket_conversations_per_tcp_stream_id = defaultdict[int, list[WebSocketConversation]](list)

        for layers in layers_mapping.values():
            if 'websocket' in layers:
                ws_frames = WebSocketMessagesInNetworkFramePossiblyIncomplete(layers)  # type: ignore[arg-type]
                ws_convs = websocket_conversations_per_tcp_stream_id[ws_frames.tcp_stream_id]
                # the handshake must have been seen before any websocket frame of the same TCP stream
                assert ws_convs, (ws_frames.tcp_stream_id, ws_frames)
                ws_convs[-1].push(ws_frames)
                continue
            if 'http' not in layers:
                # happens that both 'http' & 'http2' are in `protocols` but only 'http2' in actual layers
                continue
            # we only retain HTTP requests from now on
            for i, request in enumerate(self._requests_in_frame(layers)):
                try:
                    response_nb = request.response_frame_nb
                except KeyError:
                    orphan_requests_per_tcp_stream[request.tcp_stream_id].append(request)
                    continue
                if response_nb in response_nb_blacklist:
                    assert i > 0, layers  # tshark may get confused when having multiple http layers in same frame (like us)
                    LOGGER.warning(f"Ambiguous response #{response_nb} due to multiple HTTP1 requests in same frame #{request.frame_nb}")
                    orphan_requests_per_tcp_stream[request.tcp_stream_id].append(request)
                    continue
                response_nb_blacklist.add(response_nb)
                http_conversation = HttpConversation(request, HttpResponse(layers_mapping[response_nb]))
                self.conversations.append(http_conversation)
                # handle websocket conversations if needed
                if http_conversation.websocket_conversation is not None:
                    ws_convs_for_cur_tcp_stream = websocket_conversations_per_tcp_stream_id[http_conversation.tcp_stream_id]
                    open_ws_convs_for_cur_tcp_stream = [ws_conv for ws_conv in ws_convs_for_cur_tcp_stream if not ws_conv.is_closed]
                    if open_ws_convs_for_cur_tcp_stream:
                        raise NotImplementedError(
                            "There are still some opened WebSocket conversations "
                            f"for TCP stream #{http_conversation.tcp_stream_id}: {open_ws_convs_for_cur_tcp_stream}"
                        )
                    ws_convs_for_cur_tcp_stream.append(http_conversation.websocket_conversation)

    def _match_orphan_responses(
        self,
        layers_mapping: Any,
        orphan_requests_per_tcp_stream: defaultdict[int, list[HttpRequest]],
        response_nb_blacklist: set[int],
    ) -> None:
        """
        Try to match orphan responses with orphan requests (esp. for '206 Partial content' responses).

        :param layers_mapping: frame number -> layers mapping, as returned by `get_layers_mapping`
        :param orphan_requests_per_tcp_stream: orphan requests per TCP stream; matched requests are removed from it
        :param response_nb_blacklist: frame numbers of the responses that are already paired (skipped here)
        """
        for response_nb, response_layers in layers_mapping.items():
            response_http_layer = response_layers.get('http')  # NOT a list for responses due to earlier check in requests phase
            if response_nb in response_nb_blacklist or not (response_http_layer and 'http.response' in response_http_layer):
                continue
            response = HttpResponse(response_layers)
            existing_orphan_requests = orphan_requests_per_tcp_stream.get(response.tcp_stream_id, [])
            # candidates are the orphan requests of the same TCP stream that are close enough in time;
            # they are sorted by time distance, requests seen after the response getting an extra penalty
            possible_requests = sorted([
                (req_ix, req.frame_nb, delta_ms)
                for req_ix, req in enumerate(existing_orphan_requests)
                if DELTA_MS_ORPHANS_WINDOW_IGNORE[0] < (delta_ms := (req.timestamp - response.timestamp) * 1000) < DELTA_MS_ORPHANS_WINDOW_IGNORE[1]
            ], key=lambda tup: abs(tup[-1]) + DELTA_MS_ORPHANS_AFTER_PENALTY*(0 if tup[-1] <= 0 else 1))
            _, resp_status_code, _ = response.http_version_status_code_message
            resp_lbl = f"HTTP1 response (Frame #{response_nb}, TCP stream #{response.tcp_stream_id}, Code {resp_status_code})"
            if not possible_requests:
                # TODO? totally skip pairing for 1xx responses?
                (LOGGER.info if resp_status_code in {100, 102} else LOGGER.warning)(
                    f"Orphan {resp_lbl} did not match with any orphan HTTP1 request"
                )
                continue
            if len(possible_requests) > 1:
                LOGGER.debug(
                    f"Ambiguous matching of orphan {resp_lbl} with possible orphan requests {[f'#{req_nb}' for _, req_nb, _ in possible_requests]}"
                )
            req_ix, req_nb, delta_ms = possible_requests[0]  # first is best (sorted)
            request = existing_orphan_requests.pop(req_ix)  # this request is not orphan anymore
            if not (DELTA_MS_ORPHANS_WINDOW_WARN[0] < delta_ms < DELTA_MS_ORPHANS_WINDOW_WARN[1]):
                LOGGER.warning(f"Dubious matching of orphan {resp_lbl} with orphan request #{req_nb}")

            http_conv = HttpConversation(request, response)
            self.conversations.append(http_conv)

    @staticmethod
    def _log_remaining_orphan_requests(orphan_requests_per_tcp_stream: defaultdict[int, list[HttpRequest]]) -> None:
        """Log (as warning) the requests that are still orphan, one message per TCP stream."""
        for tcp_stream_id, orphan_requests_for_tcp_stream in orphan_requests_per_tcp_stream.items():
            if orphan_requests_for_tcp_stream:
                reqs_lbls = [
                    f"Frame #{req.frame_nb}: {' '.join(req.http_version_method)} {req.uri}"
                    for req in orphan_requests_for_tcp_stream
                ]
                LOGGER.warning(
                    f"TCP stream #{tcp_stream_id}: some orphan HTTP1 requests remain: {reqs_lbls}"
                )

    def get_har_entries(self) -> list[HarEntry]:
        """
        Convert the HTTP network traffic to HTTP Archive (HAR) format.

        :return: the HTTP network traffic in HAR format (one entry per conversation, in `conversations` order)
        """
        return [http_conversation.to_har() for http_conversation in self.conversations]