From 9bb3480b72616a77d7da31a039162cfb2c588a93 Mon Sep 17 00:00:00 2001 From: dazzling-no-more <278675588+dazzling-no-more@users.noreply.github.com> Date: Sat, 9 May 2026 21:23:41 +0400 Subject: [PATCH 1/5] feat(youtube): relay_url_patterns + SABR strip + exit-node-full SNI --- .../java/com/therealaleph/mhrv/ConfigStore.kt | 46 + src/bin/ui.rs | 17 + src/config.rs | 273 +++ src/domain_fronter.rs | 495 +++++ src/proxy_server.rs | 1922 ++++++++++++++++- 5 files changed, 2722 insertions(+), 31 deletions(-) diff --git a/android/app/src/main/java/com/therealaleph/mhrv/ConfigStore.kt b/android/app/src/main/java/com/therealaleph/mhrv/ConfigStore.kt index 0b982737..df20839a 100644 --- a/android/app/src/main/java/com/therealaleph/mhrv/ConfigStore.kt +++ b/android/app/src/main/java/com/therealaleph/mhrv/ConfigStore.kt @@ -159,6 +159,31 @@ data class MhrvConfig( */ val youtubeViaRelay: Boolean = false, + /** + * Path-pinned relay routing (Rust `relay_url_patterns`, upstream + * commit b3b9220). Each entry is a `host/path-prefix` (no scheme, + * lowercase) — paths matching go through the Apps Script relay, + * non-matching paths on the same host fall through to a direct + * SNI-rewrite HTTP forward (saving Apps Script quota). + * + * The Rust side prepends the default `youtube.com/youtubei/` + * pattern at startup, with two suppression gates: + * - `youtubeViaRelay = true` (full YT through relay → filter is + * redundant) + * - exit-node "full" mode (every URL must route through the + * second-hop exit node → filter would bypass it) + * Plus, when either gate is active, user-supplied patterns whose + * host overlaps `YOUTUBE_RELAY_HOSTS` (youtube.com, youtu.be, + * youtube-nocookie.com, youtubei.googleapis.com) are dropped at + * startup with a warning, since they would partially defeat the + * full-relay contract. + * + * This Android-side field is for *additional* user entries only — + * round-tripped through config.json so a hand-edited extension + * survives a save. No UI editor (power-user knob). + */ + val relayUrlPatterns: List = emptyList(), + /** UI language toggle. Non-Rust; honoured only by the Android wrapper. */ val uiLang: UiLang = UiLang.AUTO, ) { @@ -240,6 +265,17 @@ data class MhrvConfig( put("tunnel_doh", tunnelDoh) put("block_doh", blockDoh) if (youtubeViaRelay) put("youtube_via_relay", true) + // Trim/drop-empty/dedupe before serializing — same pattern + // as bypass_doh_hosts. Skip the key entirely when the user + // hasn't added any extras so we don't leak an empty array + // into otherwise-clean configs. + val cleanRelayUrlPatterns = relayUrlPatterns + .map { it.trim() } + .filter { it.isNotEmpty() } + .distinct() + if (cleanRelayUrlPatterns.isNotEmpty()) { + put("relay_url_patterns", JSONArray().apply { cleanRelayUrlPatterns.forEach { put(it) } }) + } // Trim/drop-empty/dedupe before serializing — symmetric with the // read-side normalization in loadFromJson(), so a user typing // " doh.foo " or accidentally adding a duplicate doesn't end up @@ -356,6 +392,13 @@ object ConfigStore { if (cleanBypassDohHosts.isNotEmpty()) { obj.put("bypass_doh_hosts", JSONArray().apply { cleanBypassDohHosts.forEach { put(it) } }) } + val cleanRelayUrlPatterns = cfg.relayUrlPatterns + .map { it.trim() } + .filter { it.isNotEmpty() } + .distinct() + if (cleanRelayUrlPatterns.isNotEmpty()) { + obj.put("relay_url_patterns", JSONArray().apply { cleanRelayUrlPatterns.forEach { put(it) } }) + } // Compress with DEFLATE then base64. val jsonBytes = obj.toString().toByteArray(Charsets.UTF_8) @@ -459,6 +502,9 @@ object ConfigStore { bypassDohHosts = obj.optJSONArray("bypass_doh_hosts")?.let { arr -> buildList { for (i in 0 until arr.length()) add(arr.optString(i)) } }?.filter { it.isNotBlank() }.orEmpty(), + relayUrlPatterns = obj.optJSONArray("relay_url_patterns")?.let { arr -> + buildList { for (i in 0 until arr.length()) add(arr.optString(i)) } + }?.filter { it.isNotBlank() }.orEmpty(), connectionMode = when (obj.optString("connection_mode", "vpn_tun")) { "proxy_only" -> ConnectionMode.PROXY_ONLY else -> ConnectionMode.VPN_TUN diff --git a/src/bin/ui.rs b/src/bin/ui.rs index 44442206..c596ada7 100644 --- a/src/bin/ui.rs +++ b/src/bin/ui.rs @@ -251,6 +251,9 @@ struct FormState { google_ip_validation: bool, normalize_x_graphql: bool, youtube_via_relay: bool, + /// Round-tripped from config.json. No UI control yet — power-user + /// edit. See config.rs `relay_url_patterns` (b3b9220). + relay_url_patterns: Vec, passthrough_hosts: Vec, /// Round-tripped from config.json so the UI's save path doesn't /// drop the user's setting. Not currently exposed as a UI control; @@ -385,6 +388,7 @@ fn load_form() -> (FormState, Option) { scan_batch_size: c.scan_batch_size, normalize_x_graphql: c.normalize_x_graphql, youtube_via_relay: c.youtube_via_relay, + relay_url_patterns: c.relay_url_patterns.clone(), passthrough_hosts: c.passthrough_hosts.clone(), block_quic: c.block_quic, disable_padding: c.disable_padding, @@ -424,6 +428,7 @@ fn load_form() -> (FormState, Option) { scan_batch_size: 500, normalize_x_graphql: false, youtube_via_relay: false, + relay_url_patterns: Vec::new(), passthrough_hosts: Vec::new(), block_quic: true, disable_padding: false, @@ -580,6 +585,11 @@ impl FormState { // config-only flag for now. Passed through from the loaded // config if set, otherwise defaults to false. youtube_via_relay: self.youtube_via_relay, + // relay_url_patterns is config-only too (b3b9220). No UI + // editor yet — the default `youtube.com/youtubei/` ships + // automatically; round-trip preserves any extras the user + // added by hand. + relay_url_patterns: self.relay_url_patterns.clone(), // Similarly config-only for now; round-trips through the // file so the UI doesn't drop the user's entries on save. passthrough_hosts: self.passthrough_hosts.clone(), @@ -666,6 +676,12 @@ struct ConfigWire<'a> { normalize_x_graphql: bool, #[serde(skip_serializing_if = "is_false")] youtube_via_relay: bool, + /// Path-prefix relay routing (b3b9220). Default is empty — the + /// built-in `youtube.com/youtubei/` is added at proxy startup, not + /// written into config.json — so configs stay clean unless the user + /// added their own extras. + #[serde(skip_serializing_if = "Vec::is_empty")] + relay_url_patterns: &'a Vec, #[serde(skip_serializing_if = "Vec::is_empty")] passthrough_hosts: &'a Vec, // IP-scan knobs. These used to be missing from the wire struct, so @@ -773,6 +789,7 @@ impl<'a> From<&'a Config> for ConfigWire<'a> { .map(|v| v.iter().map(String::as_str).collect()), normalize_x_graphql: c.normalize_x_graphql, youtube_via_relay: c.youtube_via_relay, + relay_url_patterns: &c.relay_url_patterns, passthrough_hosts: &c.passthrough_hosts, fetch_ips_from_api: c.fetch_ips_from_api, max_ips_to_scan: c.max_ips_to_scan, diff --git a/src/config.rs b/src/config.rs index 02ae2aad..604c12e0 100644 --- a/src/config.rs +++ b/src/config.rs @@ -161,6 +161,44 @@ pub struct Config { #[serde(default)] pub youtube_via_relay: bool, + /// URL path prefixes that are forced through the Apps Script relay + /// (instead of the SNI-rewrite tunnel) — pinned by host AND path, + /// so only the matching paths burn relay quota while other paths + /// on the same host stay on the fast SNI-rewrite forward path. + /// + /// Format: `host/path/prefix` (no scheme). Hosts here are pulled + /// out of the built-in SNI-rewrite suffix list so the proxy MITMs + /// them and can inspect URLs. A request whose URL starts with the + /// pattern goes through the relay; any other path on the same host + /// is forwarded over a fresh TLS connection to `google_ip` with + /// SNI=`front_domain` (i.e. the SNI-rewrite trick at the HTTP + /// layer instead of the TLS-tunnel layer). + /// + /// Default: `youtube.com/youtubei/` is prepended unless + /// `youtube_via_relay = true` OR `exit_node.mode == "full"` is + /// active (in which cases YouTube is fully relayed already, so the + /// per-path filter is redundant — and in exit-node-full the filter + /// would actively bypass the second hop, so it has to stay off). + /// User entries are appended to the default in apps_script mode. + /// + /// **Cannot disable the default by setting an empty list**: + /// `#[serde(default)]` collapses an omitted key and an explicit + /// `[]` to the same `Vec::new()`, so the default is always + /// prepended whenever the gate above doesn't suppress it. To turn + /// off the YouTube path filter entirely, flip `youtube_via_relay + /// = true` (full YT relay, redundant filter) or run in `direct` / + /// `full` mode (no apps_script path → no filter). + /// + /// Why this exists: YouTube's in-page RPC at `/youtubei/v1/...` + /// is where SafeSearch / live-video gating decisions are made. + /// Pre-port, the only fix was `youtube_via_relay = true`, which + /// burnt Apps Script quota on every static asset. Path-pinning + /// the relay to `/youtubei/` recovers the SafeSearch fix at ~1% + /// of the quota cost. Ported from upstream `RELAY_URL_PATTERNS` / + /// `relay_url_patterns` (commit b3b9220). + #[serde(default)] + pub relay_url_patterns: Vec, + /// User-configurable passthrough list. Any host whose name matches /// one of these entries bypasses the Apps Script relay entirely and /// is plain-TCP-passthroughed (optionally through `upstream_socks5`). @@ -540,6 +578,90 @@ fn default_verify_ssl() -> bool { true } +/// Validate a single `relay_url_patterns` entry. The expected shape is +/// `host/path-prefix` (or bare `host`, equivalent to `host/`), with an +/// optional `http://` / `https://` scheme that gets stripped at runtime. +/// +/// Checks: +/// * non-empty after trim +/// * scheme strip leaves a non-empty host +/// * host part is a syntactically valid DNS hostname per RFC 1123: +/// labels of [a-zA-Z0-9-], ≤ 63 chars each, no leading/trailing +/// hyphen, no empty labels (which would mean consecutive dots), +/// total host ≤ 253 chars +/// +/// What's intentionally NOT checked here: +/// * whether the host is in `SNI_REWRITE_SUFFIXES` — that's a runtime +/// `tracing::warn!` (see `ResolvedRouting::skipped_force_mitm_hosts`) +/// because users may want a pattern to be no-op-pulled-out-of-MITM +/// path filter while the matching half still routes via relay. +/// * path syntax — any byte sequence after the first `/` is treated +/// as a literal prefix to `starts_with` against URL paths, by design. +fn validate_relay_url_pattern(p: &str) -> Result<(), String> { + let trimmed = p.trim(); + if trimmed.is_empty() { + return Err("entry is empty / whitespace-only".to_string()); + } + // Strip a scheme (case-insensitive) the same way ResolvedRouting + // does at startup, so user input like "https://Foo.com/path/" is + // validated as its post-normalization form. + let lower = trimmed.to_ascii_lowercase(); + let no_scheme = lower + .strip_prefix("https://") + .or_else(|| lower.strip_prefix("http://")) + .unwrap_or(&lower); + if no_scheme.is_empty() { + return Err("scheme present but no host follows".to_string()); + } + // Bare hosts (no slash) are valid — they mean "any path on this host" + // and round-trip through the same matchers as `host/`. + let host = match no_scheme.find('/') { + Some(i) => &no_scheme[..i], + None => no_scheme, + }; + let host = host.trim_end_matches('.'); + if host.is_empty() { + return Err("host part is empty".to_string()); + } + if host.len() > 253 { + return Err(format!( + "host exceeds 253 characters ({} chars)", + host.len() + )); + } + for label in host.split('.') { + if label.is_empty() { + return Err(format!( + "host '{}' contains an empty label (consecutive or leading dots)", + host + )); + } + if label.len() > 63 { + return Err(format!( + "host label '{}' exceeds 63 characters", + label + )); + } + let bytes = label.as_bytes(); + if bytes[0] == b'-' || bytes[bytes.len() - 1] == b'-' { + return Err(format!( + "host label '{}' starts or ends with a hyphen", + label + )); + } + for c in label.chars() { + if !(c.is_ascii_alphanumeric() || c == '-') { + return Err(format!( + "host label '{}' contains invalid character '{}' \ + (only ASCII letters, digits, and hyphens allowed)", + label, c + )); + } + } + } + Ok(()) +} + impl Config { pub fn load(path: &Path) -> Result { let data = std::fs::read_to_string(path) @@ -624,6 +746,20 @@ impl Config { } } } + // `relay_url_patterns` is documented as `host/path-prefix` (no + // scheme) but `Vec` deserializes anything. Validate the + // shape at load time so a typo like `///` or `host..com/` becomes + // a fail-fast error instead of a late routing surprise. Mirrors + // the fail-fast contract the rest of validate() applies to + // fronting_groups, scan_batch_size, etc. + for (i, p) in self.relay_url_patterns.iter().enumerate() { + if let Err(e) = validate_relay_url_pattern(p) { + return Err(ConfigError::Invalid(format!( + "relay_url_patterns[{}] ('{}'): {}", + i, p, e + ))); + } + } Ok(()) } @@ -860,6 +996,143 @@ mod tests { let cfg: Config = serde_json::from_str(s).unwrap(); assert!(cfg.validate().is_err()); } + + // ── relay_url_patterns validation ──────────────────────────────────── + + #[test] + fn validate_relay_url_pattern_accepts_canonical_forms() { + // The default pattern. + assert!(validate_relay_url_pattern("youtube.com/youtubei/").is_ok()); + // Bare host (any path) — equivalent to `host/`. + assert!(validate_relay_url_pattern("youtube.com").is_ok()); + // Trailing dot on host (FQDN form). + assert!(validate_relay_url_pattern("youtube.com./youtubei/").is_ok()); + assert!(validate_relay_url_pattern("youtube.com.").is_ok()); + // Mixed case is fine — runtime lowercases before matching. + assert!(validate_relay_url_pattern("YouTube.com/YouTubei/").is_ok()); + // Scheme is stripped at runtime; validator accepts both forms. + assert!(validate_relay_url_pattern("https://youtube.com/youtubei/").is_ok()); + assert!(validate_relay_url_pattern("http://youtube.com/api/").is_ok()); + assert!(validate_relay_url_pattern("HTTPS://YouTube.com/api/").is_ok()); + // Multi-label hosts, hyphens inside labels. + assert!(validate_relay_url_pattern("api-v2.example.com/path/").is_ok()); + assert!(validate_relay_url_pattern("foo-bar.googleapis.com/v1/").is_ok()); + // IPv4 literal — labels are all-digits, still passes the + // alphanumeric+hyphen rule. Realistic for `hosts`-overridden + // edges or local testing. + assert!(validate_relay_url_pattern("1.2.3.4/path/").is_ok()); + // Path can be anything past the first `/` — it's just a + // `starts_with` prefix. + assert!(validate_relay_url_pattern("youtube.com/v1.0/").is_ok()); + assert!(validate_relay_url_pattern("youtube.com/api?weird=ok").is_ok()); + // Whitespace surrounding the entry is trimmed. + assert!(validate_relay_url_pattern(" youtube.com/youtubei/ ").is_ok()); + } + + #[test] + fn validate_relay_url_pattern_rejects_empty() { + assert!(validate_relay_url_pattern("").is_err()); + assert!(validate_relay_url_pattern(" ").is_err()); + assert!(validate_relay_url_pattern("\t\n").is_err()); + } + + #[test] + fn validate_relay_url_pattern_rejects_missing_host() { + // Just a path. + assert!(validate_relay_url_pattern("/").is_err()); + assert!(validate_relay_url_pattern("/api/").is_err()); + // Scheme but no host. + assert!(validate_relay_url_pattern("https://").is_err()); + assert!(validate_relay_url_pattern("https:///path/").is_err()); + // Just dots. + assert!(validate_relay_url_pattern(".").is_err()); + assert!(validate_relay_url_pattern("./api/").is_err()); + // Garbage. + assert!(validate_relay_url_pattern("///").is_err()); + } + + #[test] + fn validate_relay_url_pattern_rejects_malformed_host() { + // Consecutive dots → empty label in the middle. + assert!(validate_relay_url_pattern("host..com/path/").is_err()); + // Leading hyphen on a label. + assert!(validate_relay_url_pattern("-host.com/path/").is_err()); + // Trailing hyphen on a label. + assert!(validate_relay_url_pattern("host-.com/path/").is_err()); + assert!(validate_relay_url_pattern("foo.host-.com/").is_err()); + // Whitespace inside the host. + assert!(validate_relay_url_pattern("host with space/path/").is_err()); + // Underscore — disallowed in DNS hostnames per RFC 1123. + assert!(validate_relay_url_pattern("ho_st.com/path/").is_err()); + // Special characters that signal user pasted a URL with auth / + // query / fragment into the host slot. + assert!(validate_relay_url_pattern("user@host.com/path/").is_err()); + assert!(validate_relay_url_pattern("ho?st.com/path/").is_err()); + assert!(validate_relay_url_pattern("ho#st.com/path/").is_err()); + } + + #[test] + fn validate_relay_url_pattern_rejects_oversized_labels() { + // Per RFC 1123: each label ≤ 63 chars. + let long_label = "a".repeat(64); + let pat = format!("{}.com/path/", long_label); + assert!(validate_relay_url_pattern(&pat).is_err()); + + // Total host ≤ 253 chars. + let many_labels: Vec = (0..40).map(|i| format!("label{:02}", i)).collect(); + let very_long_host = many_labels.join("."); + // many_labels has 40 entries of 7 chars + 39 dots = 319 > 253. + let pat = format!("{}/path/", very_long_host); + assert!(validate_relay_url_pattern(&pat).is_err()); + } + + #[test] + fn config_validate_surfaces_relay_pattern_errors_with_index_and_pattern() { + // End-to-end: a malformed entry must fail Config::validate() with + // an error that names both the index and the offending pattern, + // so a user staring at a multi-line config can locate it. Mirrors + // the fronting_groups error shape. + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "relay_url_patterns": [ + "youtube.com/youtubei/", + "host..com/oops/" + ] + }"#; + let cfg: Config = serde_json::from_str(s).unwrap(); + let err = cfg.validate().expect_err("malformed entry must fail"); + let msg = format!("{}", err); + assert!( + msg.contains("relay_url_patterns[1]"), + "error must name the entry index: {}", + msg + ); + assert!( + msg.contains("host..com/oops/"), + "error must echo the offending pattern: {}", + msg + ); + } + + #[test] + fn config_validate_accepts_well_formed_relay_patterns() { + // Mixed canonical / scheme-prefixed / bare-host entries all pass. + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "relay_url_patterns": [ + "youtube.com/youtubei/", + "https://googleapis.com/api/", + "studio.youtube.com", + "1.2.3.4/health/" + ] + }"#; + let cfg: Config = serde_json::from_str(s).unwrap(); + cfg.validate().expect("well-formed patterns must validate"); + } } #[cfg(test)] diff --git a/src/domain_fronter.rs b/src/domain_fronter.rs index ed8e3554..a80afd27 100644 --- a/src/domain_fronter.rs +++ b/src/domain_fronter.rs @@ -1645,6 +1645,40 @@ impl DomainFronter { url }; + // SABR quality-track strip for YouTube `/videoplayback` POST bodies. + // Has to run before the exit-node short-circuit so the exit-node + // path also benefits — same quota-relief reason as the relay path + // (Apps Script's UrlFetchApp + the exit node both cap at ~10 MB). + // Pure transform on body bytes; no-op on non-SABR payloads. + // + // Host-gated to YouTube's videoplayback hosts so an unrelated + // service that happens to expose a `/videoplayback` endpoint + // serving a protobuf-shaped POST body (with top-level fields 2 + // and 3) doesn't get its field-3 entries silently rewritten. + // YouTube's videoplayback URLs live on either `*.googlevideo.com` + // (the chunk CDN) or `youtube.com` itself; both are gated. + let stripped_body; + let body: &[u8] = if method == "POST" + && !body.is_empty() + && url.contains("/videoplayback") + && url_host_is_youtube_video_endpoint(url) + { + let s = strip_sabr_quality_tracks(body); + if s.len() != body.len() { + tracing::debug!( + "SABR strip: removed {} quality-track bytes from {}", + body.len() - s.len(), + url.split('?').next().unwrap_or(url), + ); + stripped_body = s; + stripped_body.as_slice() + } else { + body + } + } else { + body + }; + // Exit-node short-circuit: route through the configured second-hop // relay (Deno Deploy / fly.io / etc.) for hosts that need a // non-Google exit IP. The cache + coalesce layer below is bypassed @@ -3474,6 +3508,213 @@ fn status_reason(status: u16) -> &'static str { } } +/// True if `url`'s host is one of YouTube's `/videoplayback` endpoints. +/// Gates `strip_sabr_quality_tracks` so an unrelated service that +/// happens to expose `/videoplayback` and serves a protobuf-shaped body +/// with top-level fields 2 and 3 doesn't get its field-3 entries +/// silently rewritten. +/// +/// Two host families serve `/videoplayback`: +/// +/// * `*.googlevideo.com` — the YouTube chunk CDN. Most segment-fetch +/// POSTs land here on subdomains like `rrx---sn-xxx.googlevideo.com`. +/// * `youtube.com` (and subdomains) — direct same-origin endpoints +/// used by the YouTube web client in some client variants. +/// +/// Match is case-insensitive and trailing-dot tolerant. Anything else +/// returns false; the strip is a no-op. +fn url_host_is_youtube_video_endpoint(url: &str) -> bool { + let host = match extract_host(url) { + Some(h) => h, + None => return false, + }; + let h = host.trim_end_matches('.'); + if h.is_empty() { + return false; + } + const YT_VIDEOPLAYBACK_SUFFIXES: &[&str] = &[ + "googlevideo.com", + "youtube.com", + ]; + YT_VIDEOPLAYBACK_SUFFIXES + .iter() + .any(|s| h == *s || h.ends_with(&format!(".{}", s))) +} + +/// Strip top-level field-3 (quality-track selection) entries from a SABR +/// segment-fetch protobuf body. +/// +/// YouTube's SABR (Server-Adaptive Bitrate) `videoplayback` POST bodies +/// come in two distinct message shapes: +/// +/// * **Segment-fetch** — carries field-2 (tag `0x12`) byte-range entries +/// for video/audio segments. Field-3 (tag `0x1a`) entries here are +/// quality-track selectors that ask googlevideo to bundle multiple +/// simultaneous quality tracks into one response, easily exceeding +/// Apps Script `UrlFetchApp`'s ~10 MB response buffer → 502. Stripping +/// them forces a single-track response that fits. +/// +/// * **Session-init** — carries field-5 (tag `0x2a`) entries and *no* +/// field-2 entries. Field-3 here is essential session metadata +/// (language, viewer state, ...). Stripping it corrupts the init +/// handshake → CDN returns 403. +/// +/// Heuristic: only strip field-3 when at least one field-2 entry is also +/// present at the top level (segment-fetch shape). Otherwise return the +/// body unchanged. +/// +/// Only top-level fields are inspected; nested messages are left intact. +/// On a malformed body (truncated tag, unknown wire type) the unparsed +/// tail is appended verbatim so a corrupt request is never silently +/// truncated. Ported from upstream `_strip_sabr_quality_tracks` +/// (commits 9b6d03e + 33db28a). +pub(crate) fn strip_sabr_quality_tracks(body: &[u8]) -> Vec { + // Phase 1: single pass — collect (field_number, start, end) for every + // top-level field. We need to know whether field 2 exists before deciding + // to strip, but a two-pass walk would be wasteful. + let mut segments: Vec<(u32, usize, usize)> = Vec::new(); + let mut has_field2 = false; + let mut has_field3 = false; + let mut i = 0usize; + let n = body.len(); + let mut tail_start = n; + + 'outer: while i < n { + let seg_start = i; + + // Decode varint tag. + let mut tag: u64 = 0; + let mut shift: u32 = 0; + let mut tag_complete = false; + while i < n { + let b = body[i]; + i += 1; + tag |= ((b & 0x7F) as u64) << shift; + if b & 0x80 == 0 { + tag_complete = true; + break; + } + shift += 7; + if shift >= 64 { + // Pathologically long varint — bail. + tail_start = seg_start; + break 'outer; + } + } + if !tag_complete { + tail_start = seg_start; + break; + } + + let field_number = (tag >> 3) as u32; + let wire_type = (tag & 0x07) as u8; + + // Each branch advances `i` past the field's payload. Truncation + // (running off the end before the payload is whole) bails out + // with `tail_start = seg_start` so the malformed segment and + // everything after it is preserved verbatim — never silently + // dropped, never half-stripped. + match wire_type { + 0 => { + // varint payload: bytes with high bit set are + // continuation; first byte with high bit clear is the + // terminator. EOF before terminator = truncated. + let mut term = false; + while i < n { + let b = body[i]; + i += 1; + if b & 0x80 == 0 { + term = true; + break; + } + } + if !term { + tail_start = seg_start; + break; + } + } + 1 => { + // 64-bit fixed + if n - i < 8 { + tail_start = seg_start; + break; + } + i += 8; + } + 2 => { + // length-delimited: varint length, then `val_len` bytes. + let mut val_len: u64 = 0; + let mut shift: u32 = 0; + let mut len_complete = false; + while i < n { + let b = body[i]; + i += 1; + val_len |= ((b & 0x7F) as u64) << shift; + if b & 0x80 == 0 { + len_complete = true; + break; + } + shift += 7; + if shift >= 64 { + tail_start = seg_start; + break 'outer; + } + } + if !len_complete { + tail_start = seg_start; + break; + } + // Payload truncated: declared length runs past buffer end. + // Bail BEFORE recording the segment so a half-present + // field-3 isn't accidentally stripped from the output. + let val_len = val_len as usize; + if val_len > n - i { + tail_start = seg_start; + break; + } + i += val_len; + } + 5 => { + // 32-bit fixed + if n - i < 4 { + tail_start = seg_start; + break; + } + i += 4; + } + _ => { + // Unknown wire type — bail, tail copied verbatim. + tail_start = seg_start; + break; + } + } + + if field_number == 2 { + has_field2 = true; + } else if field_number == 3 { + has_field3 = true; + } + segments.push((field_number, seg_start, i)); + } + + // Phase 2: only strip when this is a segment-fetch body (has field 2) + // AND there's at least one field-3 entry to strip. + if !has_field2 || !has_field3 { + return body.to_vec(); + } + + let mut out = Vec::with_capacity(body.len()); + for (field_number, seg_start, seg_end) in segments { + if field_number != 3 { + out.extend_from_slice(&body[seg_start..seg_end]); + } + } + if tail_start < n { + out.extend_from_slice(&body[tail_start..]); + } + out +} + fn extract_host(url: &str) -> Option { let after_scheme = url.split_once("://").map(|(_, rest)| rest).unwrap_or(url); let authority = after_scheme.split('/').next().unwrap_or(""); @@ -5686,4 +5927,258 @@ hello"; } server.await.unwrap(); } + + // ── SABR quality-track strip (commits 9b6d03e + 33db28a) ───────────── + + /// Encode a single varint at the front of a buffer. + fn enc_varint(out: &mut Vec, mut v: u64) { + while v >= 0x80 { + out.push((v as u8) | 0x80); + v >>= 7; + } + out.push(v as u8); + } + + /// Encode a single length-delimited (wire-type 2) field. + fn enc_length_delim(out: &mut Vec, field: u32, payload: &[u8]) { + enc_varint(out, ((field as u64) << 3) | 2); + enc_varint(out, payload.len() as u64); + out.extend_from_slice(payload); + } + + /// Encode a single varint (wire-type 0) field. + fn enc_varint_field(out: &mut Vec, field: u32, value: u64) { + // Wire-type 0 is the zero bits; explicit `| 0` would be clippy + // `identity_op`. The shift is the entire encoding. + enc_varint(out, (field as u64) << 3); + enc_varint(out, value); + } + + #[test] + fn sabr_strip_segment_fetch_drops_field3() { + // Segment-fetch shape: has both field-2 (range descriptor) and + // field-3 (quality-track selector). Strip should remove only the + // field-3 entries, leaving everything else intact. + let mut body: Vec = Vec::new(); + enc_length_delim(&mut body, 2, b"range-descriptor-1"); + enc_length_delim(&mut body, 3, b"quality-track-selector-1"); + enc_length_delim(&mut body, 2, b"range-descriptor-2"); + enc_length_delim(&mut body, 3, b"quality-track-selector-2"); + enc_varint_field(&mut body, 4, 12345); // some other field + + let mut expected: Vec = Vec::new(); + enc_length_delim(&mut expected, 2, b"range-descriptor-1"); + enc_length_delim(&mut expected, 2, b"range-descriptor-2"); + enc_varint_field(&mut expected, 4, 12345); + + assert_eq!(strip_sabr_quality_tracks(&body), expected); + } + + #[test] + fn sabr_strip_session_init_leaves_field3_alone() { + // Session-init shape: has field-5 entries and field-3, but NO + // field-2. Field-3 here is essential session metadata — + // stripping it would corrupt the handshake → CDN 403. Heuristic: + // require field-2 presence before stripping field-3. + let mut body: Vec = Vec::new(); + enc_length_delim(&mut body, 5, b"session-context"); + enc_length_delim(&mut body, 3, b"language-and-state-metadata"); + enc_varint_field(&mut body, 7, 1); + + let original = body.clone(); + // Untouched. + assert_eq!(strip_sabr_quality_tracks(&body), original); + } + + #[test] + fn sabr_strip_no_field3_is_noop() { + // Plain segment-fetch with only field-2 entries — nothing to strip. + let mut body: Vec = Vec::new(); + enc_length_delim(&mut body, 2, b"only-range-descriptors"); + enc_length_delim(&mut body, 2, b"more-range-descriptors"); + + let original = body.clone(); + assert_eq!(strip_sabr_quality_tracks(&body), original); + } + + #[test] + fn sabr_strip_empty_body_is_noop() { + assert_eq!(strip_sabr_quality_tracks(b""), b""); + } + + #[test] + fn sabr_strip_truncated_tag_preserves_remainder_verbatim() { + // A field-2 entry followed by a truncated varint tag (high bit + // set, no continuation byte). Strip should bail at the truncated + // tag, copying the remaining bytes verbatim — never silently + // dropping the tail. + let mut body: Vec = Vec::new(); + enc_length_delim(&mut body, 2, b"first-entry"); + // Truncated: high bit set, then EOF. + body.push(0x80); + + let result = strip_sabr_quality_tracks(&body); + // Body had no field-3 to strip → entire input returned. + assert_eq!(result, body); + } + + #[test] + fn sabr_strip_truncated_tag_after_field3_preserves_tail() { + // field-2, field-3, then truncated. Strip should remove the + // field-3 entry (segment-fetch shape) and copy the truncated + // tail verbatim. + let mut body: Vec = Vec::new(); + enc_length_delim(&mut body, 2, b"range-desc"); + enc_length_delim(&mut body, 3, b"quality-track"); + // truncated tag + body.push(0x80); + + let mut expected: Vec = Vec::new(); + enc_length_delim(&mut expected, 2, b"range-desc"); + expected.push(0x80); + + assert_eq!(strip_sabr_quality_tracks(&body), expected); + } + + #[test] + fn sabr_strip_unknown_wire_type_preserves_remainder() { + // Wire type 6 (unused / reserved) — the strip should bail at + // the unknown wire type and copy the rest verbatim. Build it + // manually so we control the wire-type bits. + let mut body: Vec = Vec::new(); + enc_length_delim(&mut body, 2, b"first-range"); + // tag = (field 9 << 3) | wire 6 = 0x4E + body.push(0x4E); + body.extend_from_slice(b"\x01\x02\x03"); // some bytes after + + // No field-3 ever seen → nothing to strip → full body returned. + assert_eq!(strip_sabr_quality_tracks(&body), body); + } + + #[test] + fn sabr_strip_truncated_field3_payload_preserves_segment_verbatim() { + // field-2 (legitimate range descriptor) followed by a TRUNCATED + // field-3 entry: the length varint says 100 bytes but only 5 are + // present. The original `i.saturating_add(val_len).min(n)` would + // have clamped to EOF and let the segment be stripped, silently + // dropping the partial bytes. Correct behaviour: bail at the + // truncated payload and emit everything from there to EOF + // verbatim, untouched. + let mut body: Vec = Vec::new(); + enc_length_delim(&mut body, 2, b"good-range"); + // Manually emit a truncated field-3 length-delimited header: + // tag = (field 3 << 3) | 2 = 0x1A, len = 100, then only 5 bytes. + body.push(0x1A); + body.push(100); // declares 100-byte payload + body.extend_from_slice(b"short"); + + let mut expected: Vec = Vec::new(); + enc_length_delim(&mut expected, 2, b"good-range"); + // Truncated tail copied verbatim from where field-3 starts. + expected.push(0x1A); + expected.push(100); + expected.extend_from_slice(b"short"); + + assert_eq!(strip_sabr_quality_tracks(&body), expected); + } + + #[test] + fn sabr_strip_truncated_field2_payload_preserves_segment_verbatim() { + // Even segment-fetch detection must not over-eagerly believe a + // truncated field-2 is real — the segment-fetch heuristic only + // fires on COMPLETE field-2 entries. Here field-2 is truncated; + // the parser bails at it and the malformed tail is preserved. + let mut body: Vec = Vec::new(); + // Tag = field 2, wire 2 = 0x12, declared len = 50, only 3 bytes. + body.push(0x12); + body.push(50); + body.extend_from_slice(b"abc"); + + // No prior field-2 OR field-3 captured → strip is a no-op + // and returns the buffer unchanged. + assert_eq!(strip_sabr_quality_tracks(&body), body); + } + + #[test] + fn sabr_strip_truncated_fixed_width_preserves_segment_verbatim() { + // 64-bit fixed (wire type 1) — only 3 of 8 bytes present. + // Without a length-check this used to clamp via .min(n) and + // declare the field "complete." Now bails at the segment. + let mut body: Vec = Vec::new(); + enc_length_delim(&mut body, 2, b"r"); + enc_length_delim(&mut body, 3, b"q"); + // Tag = field 4, wire 1 = 0x21, then only 3 bytes follow. + body.push(0x21); + body.extend_from_slice(b"\x01\x02\x03"); + + // The malformed tail starts at the truncated fixed-width tag, + // so the field-2 / field-3 we did parse get emitted (segment- + // fetch shape, field-3 stripped), then the tail verbatim. + let mut expected: Vec = Vec::new(); + enc_length_delim(&mut expected, 2, b"r"); + // field-3 stripped here + expected.push(0x21); + expected.extend_from_slice(b"\x01\x02\x03"); + assert_eq!(strip_sabr_quality_tracks(&body), expected); + } + + // ── SABR host gate (defense against unrelated /videoplayback) ──────── + + #[test] + fn sabr_host_gate_recognises_youtube_video_endpoints() { + // YouTube chunk CDN and yt itself. + assert!(url_host_is_youtube_video_endpoint( + "https://rrx---sn-xxx.googlevideo.com/videoplayback?...&itag=18" + )); + assert!(url_host_is_youtube_video_endpoint( + "https://googlevideo.com/videoplayback" + )); + assert!(url_host_is_youtube_video_endpoint( + "https://www.youtube.com/videoplayback" + )); + // Case-insensitive + trailing dot. + assert!(url_host_is_youtube_video_endpoint( + "https://GoogleVideo.com/videoplayback" + )); + assert!(url_host_is_youtube_video_endpoint( + "https://googlevideo.com./videoplayback" + )); + } + + #[test] + fn sabr_host_gate_rejects_unrelated_hosts_with_videoplayback_path() { + // Any service that incidentally has `/videoplayback` must not + // be treated as YouTube SABR. + assert!(!url_host_is_youtube_video_endpoint( + "https://api.example.com/videoplayback" + )); + assert!(!url_host_is_youtube_video_endpoint( + "https://my-company.internal/videoplayback?id=42" + )); + // Suffix-attack: non-googlevideo hosts ending in similar bytes. + assert!(!url_host_is_youtube_video_endpoint( + "https://evilgooglevideo.com/videoplayback" + )); + assert!(!url_host_is_youtube_video_endpoint( + "https://notyoutube.com/videoplayback" + )); + } + + #[test] + fn sabr_strip_truncated_varint_payload_preserves_segment_verbatim() { + // Wire type 0 (varint) with a continuation byte and no terminator. + let mut body: Vec = Vec::new(); + enc_length_delim(&mut body, 2, b"r"); + enc_length_delim(&mut body, 3, b"q"); + // Tag = field 5, wire 0 = 0x28, then a continuation byte that + // never terminates. + body.push(0x28); + body.push(0x80); // continuation, then EOF + + let mut expected: Vec = Vec::new(); + enc_length_delim(&mut expected, 2, b"r"); + expected.push(0x28); + expected.push(0x80); + assert_eq!(strip_sabr_quality_tracks(&body), expected); + } } diff --git a/src/proxy_server.rs b/src/proxy_server.rs index bdd47878..81568522 100644 --- a/src/proxy_server.rs +++ b/src/proxy_server.rs @@ -118,6 +118,26 @@ const YOUTUBE_RELAY_HOSTS: &[&str] = &[ "youtubei.googleapis.com", ]; +/// URL path-prefix patterns that are forced through the Apps Script relay. +/// Each entry is `host/path-prefix` (no scheme, lowercase). The host is +/// pulled out of `SNI_REWRITE_SUFFIXES` so the proxy MITMs and can inspect +/// paths; only URLs starting with the pattern go to relay, all other paths +/// on that host fall through to the SNI-rewrite HTTP forwarder +/// (`forward_via_sni_rewrite_http`) — same SNI-rewrite trick as the +/// CONNECT-tunnel path, but applied at the HTTP layer so we keep MITM +/// for the matching paths. User-supplied entries from +/// `Config::relay_url_patterns` are appended to this default. +/// +/// `youtube.com/youtubei/`: YouTube's in-page RPC layer. Restricted Mode / +/// SafeSearch / live-stream gating decisions land here. Relaying just +/// this prefix recovers the SafeSearch fix that previously required the +/// full `youtube_via_relay = true` knob (which routed every static +/// asset through the relay too). Ported from upstream +/// `RELAY_URL_PATTERNS` (commit b3b9220). +const DEFAULT_RELAY_URL_PATTERNS: &[&str] = &[ + "youtube.com/youtubei/", +]; + /// Built-in list of DNS-over-HTTPS endpoints. CONNECTs to these (when /// `tunnel_doh` is left at the default of `false`, i.e. bypass enabled) /// skip the Apps Script tunnel and exit via plain TCP. Mix of the @@ -156,7 +176,11 @@ const DEFAULT_DOH_HOSTS: &[&str] = &[ "dns.mullvad.net", ]; -fn matches_sni_rewrite(host: &str, youtube_via_relay: bool) -> bool { +fn matches_sni_rewrite( + host: &str, + youtube_via_relay: bool, + force_mitm_hosts: &[String], +) -> bool { let h = host.to_ascii_lowercase(); let h = h.trim_end_matches('.'); @@ -177,11 +201,89 @@ fn matches_sni_rewrite(host: &str, youtube_via_relay: bool) -> bool { } } + // Hosts pulled out of SNI-rewrite by `relay_url_patterns` (b3b9220). + // We need to MITM these so the per-path matcher in + // `handle_mitm_request` can decide between relay and the SNI-rewrite + // HTTP forwarder. Match shape MUST match `host_in_force_mitm_list` + // exactly — otherwise a host pulled out here wouldn't be recognised + // at dispatch and the path filter would silently no-op, which was a + // real bug in the first cut where this list also matched in the + // reverse direction (`forced.ends_with(.h)`). Reverse-matching + // pulled parent SNI suffixes for entries like `studio.youtube.com`, + // making the entire `youtube.com` subtree skip SNI-rewrite while + // dispatch only force-MITM-recognised the literal `studio.youtube.com`. + // One-directional match (`h == forced || h.ends_with(.forced)`) + // pulls only the configured host and its subdomains, leaving sibling + // subdomains on the natural SNI-rewrite path. + for forced in force_mitm_hosts { + if h == *forced || h.ends_with(&format!(".{}", forced)) { + return false; + } + } + SNI_REWRITE_SUFFIXES .iter() .any(|s| h == *s || h.ends_with(&format!(".{}", s))) } +/// True if `url` matches any entry in `patterns`. Each pattern is +/// `host/path-prefix` (no scheme, lowercase). The URL host may have extra +/// subdomains — `www.youtube.com` matches `youtube.com/youtubei/`. Path +/// match is a plain prefix on the URL's path component. +/// +/// Same matching shape as the upstream Python `_url_matches_relay_pattern` +/// (b3b9220): scheme stripped, lowercased, host suffix-anchored, path +/// `startswith`. Used in MITM dispatch to decide relay vs. SNI-rewrite +/// HTTP forward for hosts pulled out of SNI-rewrite. +pub(crate) fn url_matches_relay_pattern(url: &str, patterns: &[String]) -> bool { + if patterns.is_empty() { + return false; + } + let lower = url.to_ascii_lowercase(); + let stripped = lower + .strip_prefix("https://") + .or_else(|| lower.strip_prefix("http://")) + .unwrap_or(&lower); + let (raw_host, url_path) = match stripped.find('/') { + Some(i) => (&stripped[..i], &stripped[i..]), + None => (stripped, "/"), + }; + // Strip an authority's port (`:443`) and any FQDN trailing dot so + // `www.youtube.com.` and `www.youtube.com:443` canonicalise to the + // same form that `host_in_force_mitm_list` and `extract_host` use. + // Without this, dispatch and pattern-match disagree: the host is + // pulled from SNI-rewrite but its `/youtubei/` URL fails the + // pattern check and ends up routed via the SNI-HTTP forwarder. + let host_no_port = raw_host.split(':').next().unwrap_or(raw_host); + let url_host = host_no_port.trim_end_matches('.'); + for p in patterns { + let (pat_host, pat_path) = match p.find('/') { + Some(i) => (&p[..i], &p[i..]), + None => (p.as_str(), "/"), + }; + let host_match = url_host == pat_host || url_host.ends_with(&format!(".{}", pat_host)); + if host_match && url_path.starts_with(pat_path) { + return true; + } + } + false +} + +/// True if the request's host is one we pulled out of SNI-rewrite to +/// support `relay_url_patterns`. Used in `handle_mitm_request` as the +/// gate for the SNI-rewrite-HTTP fallback path: if the host was forced +/// to MITM but the URL didn't match any pattern, we forward over a fresh +/// SNI-rewrite TLS connection instead of burning Apps Script quota. +pub(crate) fn host_in_force_mitm_list(host: &str, list: &[String]) -> bool { + if list.is_empty() { + return false; + } + let h = host.to_ascii_lowercase(); + let h = h.trim_end_matches('.'); + list.iter() + .any(|forced| h == *forced || h.ends_with(&format!(".{}", forced))) +} + fn hosts_override<'a>( hosts: &'a std::collections::HashMap, host: &str, @@ -230,9 +332,41 @@ pub struct RewriteCtx { pub upstream_socks5: Option, pub mode: Mode, /// If true, YouTube traffic bypasses the SNI-rewrite tunnel and - /// goes through the Apps Script relay instead. See config.rs for - /// the trade-off. Issue #102. + /// goes through the Apps Script relay instead. Effective value: + /// `config.youtube_via_relay || (apps_script + exit_node.full)` — + /// when the exit node is in full mode it must intercept all traffic + /// including YouTube, so YT hosts get pulled from SNI-rewrite the + /// same way the explicit toggle does it. Ported from upstream + /// commit 88b2767. Issue #102. pub youtube_via_relay: bool, + /// Resolved URL path-prefix patterns (`host/path-prefix`, lowercase, + /// no scheme) that force the relay path inside MITM. Built at + /// startup from `DEFAULT_RELAY_URL_PATTERNS` plus + /// `Config::relay_url_patterns`. Empty when + /// `youtube_via_relay = true` because YouTube is then fully relayed + /// already and the per-path filter would just be redundant. Used + /// by `handle_mitm_request` to decide relay vs. SNI-rewrite HTTP + /// forward. Ported from upstream `_relay_url_patterns` (b3b9220). + pub relay_url_patterns: Vec, + /// Hosts derived from `relay_url_patterns` that get pulled out of + /// `SNI_REWRITE_SUFFIXES` so the proxy MITMs them and the per-path + /// matcher can run. Lowercase, no scheme. Empty when + /// `relay_url_patterns` is empty. Used in `matches_sni_rewrite` + /// and `host_in_force_mitm_list`. + pub force_mitm_hosts: Vec, + /// Set when `mode == AppsScript && exit_node.enabled && + /// exit_node.mode == "full"` — the same condition that promotes + /// `youtube_via_relay_effective` (commit 88b2767). When true, + /// `handle_mitm_request` MUST NOT use `forward_via_sni_rewrite_http` + /// for non-matching paths, even on hosts in `force_mitm_hosts` — + /// the forwarder dials the Google edge directly, which would + /// completely bypass the second-hop exit node and violate the + /// documented "every URL routes through the exit node" contract + /// (`DomainFronter::exit_node_matches`). User-supplied + /// `relay_url_patterns` are still honoured: matching paths and + /// non-matching paths both end up in `DomainFronter::relay`, + /// which then routes through the exit node. + pub exit_node_full_mode_active: bool, /// User-configured hostnames that should skip the relay entirely /// and pass through as plain TCP (optionally via upstream_socks5). /// See config.rs `passthrough_hosts` for matching rules. Issues #39, #127. @@ -263,6 +397,227 @@ pub struct RewriteCtx { pub fronting_groups: Vec>, } +/// One-shot resolution of the YouTube routing knobs (`youtube_via_relay`, +/// `relay_url_patterns`, `exit_node.mode == "full"`) for a given +/// `Config` + `Mode`. Pulled out of `ProxyServer::new` so it can be +/// unit-tested directly without spinning up the full proxy. +/// +/// Two gates govern the resolved patterns: +/// +/// 1. **Mode gate** — only `apps_script` mode has a relay path to route +/// patterns through. In `direct` mode there's no Apps Script, so +/// pulling hosts out of SNI-rewrite would just send them to raw-TCP +/// fallback (a routing regression). In `full` mode the dispatcher +/// short-circuits to the tunnel mux before MITM ever runs, so +/// patterns would never be consulted. Outside `apps_script` the +/// resolved sets are always empty. +/// +/// 2. **youtube_via_relay-effective gate** — the explicit +/// `youtube_via_relay` toggle OR exit-node-full mode (commit 88b2767). +/// When *either* is on, YouTube is fully relayed already, so the +/// per-path filter is redundant. Worse, in exit-node-full mode the +/// filter is *harmful*: non-matching paths on `youtube.com` would +/// route via `forward_via_sni_rewrite_http`, bypassing +/// `DomainFronter::relay` and with it the exit node — defeating +/// the whole point of full mode. +/// +/// User-supplied `relay_url_patterns` entries always run inside +/// `apps_script` mode regardless of the YT flag; they may target hosts +/// other than `youtube.com` that the user wants path-pinned +/// independently. +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct ResolvedRouting { + /// Effective `youtube_via_relay` after OR-ing with exit-node-full + /// mode. Mirrors what `RewriteCtx::youtube_via_relay` ends up with. + pub youtube_via_relay_effective: bool, + /// Resolved patterns, lowercased, scheme-stripped, deduplicated. + /// Empty outside `apps_script` mode and when both gates above + /// allow the defaults to be skipped. + pub relay_url_patterns: Vec, + /// Host parts of `relay_url_patterns` that ARE + /// SNI-rewrite-capable. Pulled out of SNI-rewrite at dispatch time + /// so MITM can run for them. + pub force_mitm_hosts: Vec, + /// Host parts of `relay_url_patterns` that are NOT + /// SNI-rewrite-capable, retained only so `ProxyServer::new` can log + /// a startup warning. Patterns referencing them stay in + /// `relay_url_patterns` (so a matching path still routes through + /// the relay if the host is MITM'd via the regular TLS-detect + /// path), but the path-vs-forwarder filter is inert for them — the + /// forwarder would return a wrong-origin response from the Google + /// edge. + pub skipped_force_mitm_hosts: Vec, + /// User patterns dropped because `youtube_via_relay_effective` is + /// true AND the pattern's host is already covered by + /// `YOUTUBE_RELAY_HOSTS`. Keeping them would partially defeat the + /// "full YT through relay" contract: the path filter would flag + /// non-matching paths as forwarder-eligible, and dispatch would + /// route them via `forward_via_sni_rewrite_http` instead of the + /// relay. Surfaced for the startup warning so the user knows their + /// extra entry was redundant + harmful. + pub suppressed_yt_patterns: Vec, + /// True iff `exit_node.enabled && mode == "full"` AND we're in + /// apps_script mode. Used only for the startup log line that + /// announces the YT-via-relay implication of exit-node-full. + pub exit_node_full_mode_active: bool, +} + +impl ResolvedRouting { + pub(crate) fn from_config(config: &Config, mode: Mode) -> Self { + let exit_node_full_mode = config.exit_node.enabled + && config.exit_node.mode.eq_ignore_ascii_case("full") + && !config.exit_node.relay_url.is_empty() + && !config.exit_node.psk.is_empty(); + let exit_node_full_mode_active = exit_node_full_mode && mode == Mode::AppsScript; + let youtube_via_relay_effective = + config.youtube_via_relay || exit_node_full_mode_active; + + let mut relay_url_patterns: Vec = Vec::new(); + let mut suppressed_yt_patterns: Vec = Vec::new(); + if mode == Mode::AppsScript { + if !youtube_via_relay_effective { + for p in DEFAULT_RELAY_URL_PATTERNS { + relay_url_patterns.push((*p).to_string()); + } + } + for p in &config.relay_url_patterns { + let trimmed = p.trim(); + if trimmed.is_empty() { + continue; + } + let normalized = normalize_pattern(trimmed); + // YT-overlap suppression: when `youtube_via_relay_effective` + // is true, every YT-family host is already pulled out of + // SNI-rewrite by the `YOUTUBE_RELAY_HOSTS` carve-out, so + // every YT request flows through the relay regardless. A + // user pattern targeting a YT host adds it to + // `force_mitm_hosts`, which switches on the path filter; + // non-matching YT paths then route through + // `forward_via_sni_rewrite_http`, partially defeating the + // user's `youtube_via_relay = true` opt-in. Drop the + // pattern entirely (matching paths already go to relay + // without it) and surface it for the startup warning. + let pattern_host = normalized + .split('/') + .next() + .unwrap_or("") + .trim_start_matches('.'); + if youtube_via_relay_effective && host_matches_youtube_relay(pattern_host) { + suppressed_yt_patterns.push(normalized); + continue; + } + relay_url_patterns.push(normalized); + } + let mut seen_patterns: std::collections::HashSet = Default::default(); + relay_url_patterns.retain(|p| seen_patterns.insert(p.clone())); + } + + // Only hosts that would naturally take the SNI-rewrite tunnel + // (i.e. match `SNI_REWRITE_SUFFIXES`) are valid targets for the + // path-level filter. The non-matching path goes through + // `forward_via_sni_rewrite_http`, which dials `google_ip:443` + // with `SNI=front_domain` — the Google edge dispatches by the + // inner `Host` header, but only if that Host is actually served + // by the same edge. A user-supplied pattern like + // `evilsite.com/api/` would otherwise pull `evilsite.com` from + // the (already-not-matching) SNI list as a no-op AND make + // `host_in_force_mitm_list` true, sending non-matching paths + // through the forwarder which would return a wrong-origin + // response from the Google edge — silently treated as success. + // Filter at startup, log the skip, leave the pattern itself + // alone so a matching path still routes via relay if the host + // is reached via a different path (TLS-detect → MITM → relay). + // Fronting-group hosts are NOT eligible either: the forwarder + // only knows `(google_ip, front_domain)`, not the group's + // `(ip, sni)` pair. Path-routing on fronting groups is a + // separate feature. + let mut force_mitm_hosts: Vec = Vec::new(); + let mut skipped_hosts: Vec = Vec::new(); + let mut seen_hosts: std::collections::HashSet = Default::default(); + for p in &relay_url_patterns { + let host_part = p + .split('/') + .next() + .unwrap_or("") + .trim_start_matches('.') + .to_string(); + if host_part.is_empty() || !seen_hosts.insert(host_part.clone()) { + continue; + } + if host_is_sni_rewrite_capable(&host_part) { + force_mitm_hosts.push(host_part); + } else { + skipped_hosts.push(host_part); + } + } + + Self { + youtube_via_relay_effective, + relay_url_patterns, + force_mitm_hosts, + skipped_force_mitm_hosts: skipped_hosts, + suppressed_yt_patterns, + exit_node_full_mode_active, + } + } +} + +/// Canonicalise a `relay_url_patterns` entry to the form the runtime +/// matchers expect: lowercase, no scheme, no trailing dot on the host. +/// Lowercasing happens BEFORE scheme strip so `HTTPS://Foo.com/Bar/` +/// normalises cleanly (`trim_start_matches("https://")` is +/// case-sensitive). Trailing dots on the host (e.g. `foo.com./api/`, +/// FQDN-form) are stripped so they match against the `extract_host` → +/// trim-trailing-dot canonical form. +pub(crate) fn normalize_pattern(raw: &str) -> String { + let lower = raw.trim().to_ascii_lowercase(); + let no_scheme = lower + .strip_prefix("https://") + .or_else(|| lower.strip_prefix("http://")) + .unwrap_or(&lower); + // Split into host + path-prefix, trim a trailing dot from the host, + // re-join. Patterns without a `/` are treated as host-only. + match no_scheme.find('/') { + Some(i) => { + let host = no_scheme[..i].trim_end_matches('.'); + let rest = &no_scheme[i..]; + format!("{}{}", host, rest) + } + None => no_scheme.trim_end_matches('.').to_string(), + } +} + +/// True when `host` matches a `YOUTUBE_RELAY_HOSTS` entry under the +/// same one-directional suffix shape as `host_in_force_mitm_list`. +/// Used at startup to suppress user-supplied `relay_url_patterns` +/// whose host is already covered by the `youtube_via_relay` carve-out +/// — keeping such an entry would re-introduce the +/// `forward_via_sni_rewrite_http` bypass (the path filter would mark +/// non-matching paths as forwarder-eligible) and partially defeat the +/// "full YT through relay" contract the user opted into. +fn host_matches_youtube_relay(host: &str) -> bool { + let h = host.to_ascii_lowercase(); + let h = h.trim_end_matches('.'); + YOUTUBE_RELAY_HOSTS + .iter() + .any(|s| h == *s || h.ends_with(&format!(".{}", s))) +} + +/// True when `host` is served by the Google edge — i.e. matches one of +/// `SNI_REWRITE_SUFFIXES`. Used at startup to validate that +/// `relay_url_patterns` host parts are actually safe targets for the +/// SNI-rewrite HTTP forwarder. One-directional suffix match because we +/// only need to know "would this host be SNI-rewrite-capable in the +/// absence of force_mitm_hosts?" — bidirectional matching would falsely +/// validate sub-suffixes that the SNI list doesn't really cover. +fn host_is_sni_rewrite_capable(host: &str) -> bool { + let h = host.to_ascii_lowercase(); + let h = h.trim_end_matches('.'); + SNI_REWRITE_SUFFIXES + .iter() + .any(|s| h == *s || h.ends_with(&format!(".{}", s))) +} + /// True if `host` matches a known DoH endpoint — either the built-in /// `DEFAULT_DOH_HOSTS` list or a user-supplied entry in `extra`. Match /// is case-insensitive, and entries match either exactly OR as a @@ -497,6 +852,78 @@ impl ProxyServer { fronting_groups.push(Arc::new(resolved)); } + let resolved_routing = ResolvedRouting::from_config(config, mode); + if resolved_routing.exit_node_full_mode_active && !config.youtube_via_relay { + tracing::info!( + "exit_node.mode=full → routing YouTube through relay (upstream commit 88b2767)" + ); + } + if !resolved_routing.relay_url_patterns.is_empty() { + tracing::info!( + "relay_url_patterns: MITM forced on {}; relay only for: {}", + resolved_routing.force_mitm_hosts.join(", "), + resolved_routing.relay_url_patterns.join(", "), + ); + } + if !resolved_routing.skipped_force_mitm_hosts.is_empty() { + tracing::warn!( + "relay_url_patterns: ignoring path-routing for {} — host is not in \ + SNI_REWRITE_SUFFIXES, so the SNI-rewrite forwarder would return a \ + wrong-origin response from the Google edge. Patterns matching this \ + host still route through the relay if the host is reached, but \ + non-matching paths fall back to the regular dispatch.", + resolved_routing.skipped_force_mitm_hosts.join(", "), + ); + } + if !resolved_routing.suppressed_yt_patterns.is_empty() { + tracing::warn!( + "relay_url_patterns: dropped {} — youtube_via_relay (or \ + exit_node.mode=full) routes all YouTube through the relay \ + already, so a YT-host path filter would route non-matching \ + paths through the SNI-rewrite forwarder and partially defeat \ + the full-relay contract. Remove these entries from \ + config.json to silence this warning.", + resolved_routing.suppressed_yt_patterns.join(", "), + ); + } + + // Fronting groups are dispatched BEFORE the force-MITM check + // (`dispatch_tunnel` step 2a vs 2). That precedence is intentional + // — a user adding `youtube.com` to a fronting group is making a + // deliberate "send all of YT through this alternate edge" choice + // and the path filter, which assumes the Google edge handles the + // request, would land at the wrong upstream. But the silent + // override is a footgun if the user didn't realise the two + // features overlap, so surface it at startup with both names + // and the resolved precedence. + for forced in &resolved_routing.force_mitm_hosts { + for g in &fronting_groups { + let overlaps = g.domains_normalized.iter().any(|d| { + forced == d + || forced.ends_with(&format!(".{}", d)) + || d.ends_with(&format!(".{}", forced)) + }); + if overlaps { + tracing::warn!( + "config: fronting group '{}' covers host '{}', which is also \ + in relay_url_patterns. Fronting-group dispatch wins — the \ + path filter will NOT run for this host. Remove the host \ + from the fronting group if you want path-pinned relay routing.", + g.name, + forced, + ); + } + } + } + let ResolvedRouting { + youtube_via_relay_effective, + relay_url_patterns: resolved_patterns, + force_mitm_hosts, + skipped_force_mitm_hosts: _, + suppressed_yt_patterns: _, + exit_node_full_mode_active, + } = resolved_routing; + let rewrite_ctx = Arc::new(RewriteCtx { google_ip: config.google_ip.clone(), front_domain: config.front_domain.clone(), @@ -504,7 +931,10 @@ impl ProxyServer { tls_connector, upstream_socks5: config.upstream_socks5.clone(), mode, - youtube_via_relay: config.youtube_via_relay, + youtube_via_relay: youtube_via_relay_effective, + relay_url_patterns: resolved_patterns, + force_mitm_hosts, + exit_node_full_mode_active, passthrough_hosts: config.passthrough_hosts.clone(), block_quic: config.block_quic, bypass_doh: !config.tunnel_doh, @@ -1590,6 +2020,7 @@ fn should_use_sni_rewrite( host: &str, port: u16, youtube_via_relay: bool, + force_mitm_hosts: &[String], ) -> bool { // The SNI-rewrite path expects TLS from the client: it accepts inbound // TLS, then opens a second TLS connection to the Google edge with a front @@ -1600,9 +2031,19 @@ fn should_use_sni_rewrite( // youtube_via_relay=true removes YouTube suffixes from the match so // YouTube traffic falls through to the Apps Script relay path instead // of the SNI-rewrite tunnel. An explicit hosts override still wins - // over the config toggle. - port == 443 - && (matches_sni_rewrite(host, youtube_via_relay) || hosts_override(hosts, host).is_some()) + // over the config toggle, except for hosts pulled out by + // `relay_url_patterns` — those need MITM for the per-path matcher + // even if the user has a hosts override (the override is still used + // as the upstream IP for the SNI-rewrite forwarder, just not as a + // CONNECT-tunnel target). + if port != 443 { + return false; + } + if host_in_force_mitm_list(host, force_mitm_hosts) { + return false; + } + matches_sni_rewrite(host, youtube_via_relay, force_mitm_hosts) + || hosts_override(hosts, host).is_some() } async fn dispatch_tunnel( @@ -1726,6 +2167,7 @@ async fn dispatch_tunnel( &host, port, rewrite_ctx.youtube_via_relay, + &rewrite_ctx.force_mitm_hosts, ) { tracing::info!( "dispatch {}:{} -> sni-rewrite tunnel (Google edge direct)", @@ -1805,7 +2247,7 @@ async fn dispatch_tunnel( host, port ); - run_mitm_then_relay(sock, &host, port, mitm, &fronter).await; + run_mitm_then_relay(sock, &host, port, mitm, &fronter, rewrite_ctx.clone()).await; return Ok(()); } @@ -1819,7 +2261,7 @@ async fn dispatch_tunnel( port, scheme ); - relay_http_stream_raw(sock, &host, port, scheme, &fronter).await; + relay_http_stream_raw(sock, &host, port, scheme, &fronter, rewrite_ctx.clone()).await; return Ok(()); } @@ -2102,6 +2544,7 @@ async fn run_mitm_then_relay( port: u16, mitm: Arc>, fronter: &DomainFronter, + rewrite_ctx: Arc, ) { // Peek the TLS ClientHello BEFORE minting the MITM cert. When the client // resolves the hostname itself (DoH in Chrome/Firefox) and hands us a raw @@ -2163,7 +2606,16 @@ async fn run_mitm_then_relay( // latter would produce an IP-in-Host request that Cloudflare/etc. reject // outright. loop { - match handle_mitm_request(&mut tls, &effective_host, port, fronter, "https").await { + match handle_mitm_request( + &mut tls, + &effective_host, + port, + fronter, + "https", + &rewrite_ctx, + ) + .await + { Ok(true) => continue, Ok(false) => break, Err(e) => { @@ -2190,9 +2642,10 @@ async fn relay_http_stream_raw( port: u16, scheme: &str, fronter: &DomainFronter, + rewrite_ctx: Arc, ) { loop { - match handle_mitm_request(&mut sock, host, port, fronter, scheme).await { + match handle_mitm_request(&mut sock, host, port, fronter, scheme, &rewrite_ctx).await { Ok(true) => continue, Ok(false) => break, Err(e) => { @@ -2307,6 +2760,201 @@ async fn do_sni_rewrite_tunnel_from_tcp( Ok(()) } +/// Build the HTTP/1.1 request bytes the SNI-rewrite forwarder writes +/// upstream. Pure function — pulled out of `forward_via_sni_rewrite_http` +/// so the request-rebuilding logic can be unit-tested directly without +/// standing up a TLS connector. +/// +/// Forces `Host` to the real origin (the Google edge dispatches by the +/// inner Host even though the outer SNI is sanitised) and +/// `Connection: close` so the upstream signals end-of-response by +/// closing the TCP socket. That lets us read until EOF without parsing +/// HTTP framing on the response side. +/// +/// **Framing-header rewrite**: by the time we run, `read_body` has +/// already decoded any chunked request body into a flat byte buffer. +/// Forwarding the inbound `Transfer-Encoding: chunked` verbatim would +/// leave the upstream waiting forever for chunk markers that aren't in +/// the bytes we send. Strip every framing header (`Transfer-Encoding`, +/// any pre-existing `Content-Length`, the hop-by-hop hints `TE`, +/// `Trailer`, `Upgrade`, plus the connection-management headers +/// `Connection`, `Proxy-Connection`, `Keep-Alive`) and emit a single +/// fresh `Content-Length: ` for any method that +/// can carry a body. The result is a request the upstream can frame +/// unambiguously regardless of how the browser originally framed it. +pub(crate) fn build_sni_forward_request_bytes( + method: &str, + host: &str, + port: u16, + path: &str, + headers: &[(String, String)], + body: &[u8], +) -> Vec { + let host_with_port = if port == 443 || port == 80 { + host.to_string() + } else { + format!("{}:{}", host, port) + }; + let mut req: Vec = Vec::with_capacity(512 + body.len()); + req.extend_from_slice(method.as_bytes()); + req.extend_from_slice(b" "); + req.extend_from_slice(path.as_bytes()); + req.extend_from_slice(b" HTTP/1.1\r\n"); + req.extend_from_slice(b"Host: "); + req.extend_from_slice(host_with_port.as_bytes()); + req.extend_from_slice(b"\r\n"); + req.extend_from_slice(b"Connection: close\r\n"); + // Emit Content-Length whenever we have a body or whenever the method + // is one that semantically carries a body (POST/PUT/PATCH). For body- + // less safe methods like GET/HEAD we omit it — adding `Content-Length: 0` + // is technically valid but some origins read it as "request expects + // a body" which has caused 400s in the past. + let needs_content_length = !body.is_empty() + || method.eq_ignore_ascii_case("POST") + || method.eq_ignore_ascii_case("PUT") + || method.eq_ignore_ascii_case("PATCH"); + if needs_content_length { + req.extend_from_slice(format!("Content-Length: {}\r\n", body.len()).as_bytes()); + } + for (k, v) in headers { + if k.eq_ignore_ascii_case("host") + || k.eq_ignore_ascii_case("connection") + || k.eq_ignore_ascii_case("proxy-connection") + || k.eq_ignore_ascii_case("keep-alive") + || k.eq_ignore_ascii_case("transfer-encoding") + || k.eq_ignore_ascii_case("content-length") + || k.eq_ignore_ascii_case("te") + || k.eq_ignore_ascii_case("trailer") + || k.eq_ignore_ascii_case("upgrade") + { + continue; + } + req.extend_from_slice(k.as_bytes()); + req.extend_from_slice(b": "); + req.extend_from_slice(v.as_bytes()); + req.extend_from_slice(b"\r\n"); + } + req.extend_from_slice(b"\r\n"); + req.extend_from_slice(body); + req +} + +/// Forward an HTTP request via the SNI-rewrite trick at the HTTP layer. +/// +/// Used by `handle_mitm_request` for hosts that were pulled out of +/// SNI-rewrite by `relay_url_patterns` but whose URL path did NOT match +/// any pattern. Saves the Apps Script quota the per-path filter is +/// designed to recover, while still letting matching paths fall through +/// to the relay. +/// +/// Wire mechanics: dial `google_ip:443` (or a `hosts`-overridden IP) with +/// SNI=`front_domain`, then send a literal HTTP/1.1 request whose `Host` +/// header is the *real* origin name. The Google edge dispatches on the +/// inner `Host`, so the response comes from the right backend even though +/// the outer SNI is a sanitised one. `Connection: close` is forced so we +/// can read until EOF and never need to parse `Content-Length` / +/// `Transfer-Encoding` ourselves — and the browser side then sees +/// `Connection: close` and won't pipeline another request on the dead +/// MITM stream. +/// +/// Ported from upstream `_forward_via_sni_rewrite` (commit b3b9220). +async fn forward_via_sni_rewrite_http( + method: &str, + host: &str, + port: u16, + path: &str, + headers: &[(String, String)], + body: &[u8], + rewrite_ctx: &RewriteCtx, +) -> std::io::Result> { + let target_ip = hosts_override(&rewrite_ctx.hosts, host) + .map(|s| s.to_string()) + .unwrap_or_else(|| rewrite_ctx.google_ip.clone()); + let sni = rewrite_ctx.front_domain.clone(); + let server_name = ServerName::try_from(sni.clone()).map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::InvalidInput, + format!("invalid front_domain '{}': {}", sni, e), + ) + })?; + + let upstream_tcp = tokio::time::timeout( + std::time::Duration::from_secs(10), + TcpStream::connect((target_ip.as_str(), port)), + ) + .await + .map_err(|_| std::io::Error::new(std::io::ErrorKind::TimedOut, "upstream connect timeout"))??; + let _ = upstream_tcp.set_nodelay(true); + + let mut tls = rewrite_ctx + .tls_connector + .connect(server_name, upstream_tcp) + .await + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, format!("tls: {}", e)))?; + + let req = build_sni_forward_request_bytes(method, host, port, path, headers, body); + tls.write_all(&req).await?; + tls.flush().await?; + + // Read response until EOF / ungraceful TLS close. The upstream is + // `Connection: close`, so EOF is a complete response. UnexpectedEof + // (rustls's signal for a TCP close without a close_notify alert) is + // treated the same as a clean EOF — same compromise that + // `read_http_response` makes. + // + // A read timeout means the upstream is hung mid-response and we + // can't prove what we have is complete. Return an error so the + // caller falls back to the relay path; serving a truncated + // response to the browser would silently corrupt it. + // + // **Cap is much tighter than the global 200 MB response ceiling**: + // this code path only runs for hosts in `force_mitm_hosts` AND paths + // that did NOT match a `relay_url_patterns` entry. With the default + // pattern set that's "non-`/youtubei/` GETs on `youtube.com`" — + // realistic responses are HTML pages, JS bundles, and small inline + // assets, capped at a few MB in practice. Cutting the per-call cap + // to 32 MB shrinks the memory blast radius under concurrent load on + // memory-constrained devices (OpenWRT routers, Android) by ~6× vs + // the original 200 MB, while still leaving comfortable headroom + // above the realistic max. Streaming the body straight back to the + // browser would avoid the buffer entirely — see followup TODO; the + // tighter cap is the cheap memory-pressure defense in the meantime. + const MAX_RESP_BYTES: usize = 32 * 1024 * 1024; + let mut response = Vec::with_capacity(16 * 1024); + let mut buf = [0u8; 16 * 1024]; + loop { + let read_res = + tokio::time::timeout(std::time::Duration::from_secs(30), tls.read(&mut buf)).await; + match read_res { + Ok(Ok(0)) => break, + Ok(Ok(n)) => { + response.extend_from_slice(&buf[..n]); + if response.len() > MAX_RESP_BYTES { + return Err(std::io::Error::new( + std::io::ErrorKind::InvalidData, + "sni-rewrite forward response exceeded cap", + )); + } + } + Ok(Err(e)) if e.kind() == std::io::ErrorKind::UnexpectedEof => break, + Ok(Err(e)) => return Err(e), + Err(_) => { + return Err(std::io::Error::new( + std::io::ErrorKind::TimedOut, + "sni-rewrite forward read timeout (response may be truncated)", + )); + } + } + } + if response.is_empty() { + return Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + "sni-rewrite forward got empty response", + )); + } + Ok(response) +} + #[derive(Debug)] struct NoVerify; @@ -2370,6 +3018,7 @@ async fn handle_mitm_request( port: u16, fronter: &DomainFronter, scheme: &str, + rewrite_ctx: &Arc, ) -> std::io::Result where S: tokio::io::AsyncRead + tokio::io::AsyncWrite + Unpin, @@ -2493,6 +3142,86 @@ where return Ok(!connection_close); } + // Path-level relay routing (b3b9220). Hosts that were pulled out of + // SNI-rewrite by `relay_url_patterns` are MITM'd so we can inspect the + // URL: paths that match a pattern go through the Apps Script relay + // (this is what fixes YouTube SafeSearch / live-stream gating on + // `/youtubei/`); every other path on the same host is forwarded over + // a fresh SNI-rewrite TLS connection, saving the relay quota that the + // pre-port `youtube_via_relay = true` knob would have spent on every + // static asset. A failed forward falls through to the relay path so a + // network blip on the Google edge doesn't take the host offline. + // + // **Safe-method gate**: the forwarder is only used for GET/HEAD/OPTIONS. + // The fallback-on-error semantics combined with non-idempotent methods + // (POST/PUT/PATCH/DELETE) would be a replay hazard: write_all may + // succeed against the upstream and then a read timeout / cap-exceeded + // / late TLS error fires fallback, which sends the same side-effecting + // request through Apps Script. POSTs to non-`/youtubei/` paths on + // youtube.com are uncommon, and the quota cost of routing them via + // relay is acceptable next to the correctness risk of duplicating + // them. Mirrors the same gate on idempotency that + // `relay_parallel_range` and `parallel_relay` apply elsewhere. + // + // **Exit-node-full gate**: when `exit_node.mode = "full"` is active + // (commit 88b2767), every relay request is required to route through + // the second-hop exit node. The forwarder dials the Google edge + // directly with no awareness of the exit node, so taking it for any + // path — even ones that look "skippable" by the path filter — + // silently bypasses the exit node and breaks the documented "every + // URL routes through the exit node" contract on + // `DomainFronter::exit_node_matches`. With the gate active, + // user-supplied `relay_url_patterns` still pull their hosts out of + // SNI-rewrite (so MITM runs); the path-vs-forwarder split just + // collapses, and every path on those hosts goes to relay → exit + // node. The default `youtube.com/youtubei/` is suppressed earlier + // in `ResolvedRouting` (because `youtube_via_relay_effective` is + // true here), so this only affects user-supplied entries — which + // is the case the reviewer flagged. + let method_is_safe_for_forwarder = method.eq_ignore_ascii_case("GET") + || method.eq_ignore_ascii_case("HEAD") + || method.eq_ignore_ascii_case("OPTIONS"); + if scheme == "https" + && port == 443 + && method_is_safe_for_forwarder + && !rewrite_ctx.exit_node_full_mode_active + && !rewrite_ctx.relay_url_patterns.is_empty() + && host_in_force_mitm_list(host, &rewrite_ctx.force_mitm_hosts) + && !url_matches_relay_pattern(&url, &rewrite_ctx.relay_url_patterns) + { + tracing::info!("sni-rewrite forward {} {}", method, url); + match forward_via_sni_rewrite_http( + &method, + host, + port, + &path, + &headers, + &body, + rewrite_ctx, + ) + .await + { + Ok(response_bytes) => { + stream.write_all(&response_bytes).await?; + stream.flush().await?; + // The forwarder always sets `Connection: close` on the + // upstream request, so the upstream side has closed by + // the time we get here — propagate that to the inbound + // browser side too. The browser will reopen for the next + // request (and we'll mint a new MITM session). + return Ok(false); + } + Err(e) => { + tracing::warn!( + "sni-rewrite forward failed for {}: {} — falling back to relay", + url, + e + ); + // fall through + } + } + } + tracing::info!("relay {} {}", method, url); // For GETs without a body, take the range-parallel path — probes @@ -3233,20 +3962,23 @@ mod tests { fn sni_rewrite_is_only_for_port_443() { let mut hosts = std::collections::HashMap::new(); hosts.insert("example.com".to_string(), "1.2.3.4".to_string()); + let no_force: Vec = vec![]; - assert!(should_use_sni_rewrite(&hosts, "google.com", 443, false)); - assert!(!should_use_sni_rewrite(&hosts, "google.com", 80, false)); + assert!(should_use_sni_rewrite(&hosts, "google.com", 443, false, &no_force)); + assert!(!should_use_sni_rewrite(&hosts, "google.com", 80, false, &no_force)); assert!(should_use_sni_rewrite( &hosts, "www.example.com", 443, - false + false, + &no_force, )); assert!(!should_use_sni_rewrite( &hosts, "www.example.com", 80, - false + false, + &no_force, )); } @@ -3262,18 +3994,20 @@ mod tests { // was incorrectly carved out alongside the API surfaces. // - Non-YouTube Google suffixes are unaffected by the flag. let hosts = std::collections::HashMap::new(); + let no_force: Vec = vec![]; // Default behaviour (flag off): everything in the SNI pool // rewrites including all YouTube assets. - assert!(should_use_sni_rewrite(&hosts, "www.youtube.com", 443, false)); - assert!(should_use_sni_rewrite(&hosts, "i.ytimg.com", 443, false)); - assert!(should_use_sni_rewrite(&hosts, "youtu.be", 443, false)); - assert!(should_use_sni_rewrite(&hosts, "www.google.com", 443, false)); + assert!(should_use_sni_rewrite(&hosts, "www.youtube.com", 443, false, &no_force)); + assert!(should_use_sni_rewrite(&hosts, "i.ytimg.com", 443, false, &no_force)); + assert!(should_use_sni_rewrite(&hosts, "youtu.be", 443, false, &no_force)); + assert!(should_use_sni_rewrite(&hosts, "www.google.com", 443, false, &no_force)); assert!(should_use_sni_rewrite( &hosts, "youtubei.googleapis.com", 443, - false + false, + &no_force, )); // googlevideo.com is INTENTIONALLY NOT in SNI_REWRITE_SUFFIXES @@ -3287,23 +4021,26 @@ mod tests { &hosts, "rr1---sn-abc.googlevideo.com", 443, - false + false, + &no_force, )); // Flag on: only the API + HTML hosts opt out. - assert!(!should_use_sni_rewrite(&hosts, "www.youtube.com", 443, true)); - assert!(!should_use_sni_rewrite(&hosts, "youtu.be", 443, true)); + assert!(!should_use_sni_rewrite(&hosts, "www.youtube.com", 443, true, &no_force)); + assert!(!should_use_sni_rewrite(&hosts, "youtu.be", 443, true, &no_force)); assert!(!should_use_sni_rewrite( &hosts, "www.youtube-nocookie.com", 443, - true + true, + &no_force, )); assert!(!should_use_sni_rewrite( &hosts, "youtubei.googleapis.com", 443, - true + true, + &no_force, )); // Flag on: image / channel-asset CDNs STAY on SNI rewrite. Pre-#275 @@ -3311,20 +4048,21 @@ mod tests { // googlevideo.com still goes through the relay path (not in the // SNI list at all — see note above the SNI_REWRITE_SUFFIXES // entries) so the same flag-on assertion isn't applicable to it. - assert!(should_use_sni_rewrite(&hosts, "i.ytimg.com", 443, true)); - assert!(should_use_sni_rewrite(&hosts, "yt3.ggpht.com", 443, true)); + assert!(should_use_sni_rewrite(&hosts, "i.ytimg.com", 443, true, &no_force)); + assert!(should_use_sni_rewrite(&hosts, "yt3.ggpht.com", 443, true, &no_force)); // Flag on: non-YouTube Google suffixes are unaffected. Note // youtubei.googleapis.com (above) is the *carve-out* — the // broader googleapis.com suffix is NOT carved out, so e.g. // Drive / Calendar / etc. continue to SNI-rewrite. - assert!(should_use_sni_rewrite(&hosts, "www.google.com", 443, true)); - assert!(should_use_sni_rewrite(&hosts, "fonts.gstatic.com", 443, true)); + assert!(should_use_sni_rewrite(&hosts, "www.google.com", 443, true, &no_force)); + assert!(should_use_sni_rewrite(&hosts, "fonts.gstatic.com", 443, true, &no_force)); assert!(should_use_sni_rewrite( &hosts, "drive.googleapis.com", 443, - true + true, + &no_force, )); } @@ -3335,12 +4073,14 @@ mod tests { // user choice, the toggle is a default policy. let mut hosts = std::collections::HashMap::new(); hosts.insert("rr4.googlevideo.com".to_string(), "1.2.3.4".to_string()); + let no_force: Vec = vec![]; assert!(should_use_sni_rewrite( &hosts, "rr4.googlevideo.com", 443, - true + true, + &no_force, )); } @@ -3587,6 +4327,713 @@ mod tests { assert!(match_fronting_group("vercel.com", &groups).is_none()); } + // ── SNI-rewrite forwarder request builder (b3b9220) ────────────────── + + fn parse_request(req: &[u8]) -> (String, Vec<(String, String)>, Vec) { + let s = std::str::from_utf8(req).expect("request bytes must be utf-8"); + let mut parts = s.split("\r\n\r\n"); + let head = parts.next().unwrap(); + let body_start = head.len() + 4; + let body = req[body_start..].to_vec(); + let mut lines = head.split("\r\n"); + let request_line = lines.next().unwrap().to_string(); + let mut headers = Vec::new(); + for line in lines { + if line.is_empty() { + continue; + } + let (k, v) = line.split_once(": ").expect("malformed header line"); + headers.push((k.to_string(), v.to_string())); + } + (request_line, headers, body) + } + + fn header_present(headers: &[(String, String)], name: &str) -> bool { + headers.iter().any(|(k, _)| k.eq_ignore_ascii_case(name)) + } + + fn header_get_raw<'a>(headers: &'a [(String, String)], name: &str) -> Option<&'a str> { + headers + .iter() + .find(|(k, _)| k.eq_ignore_ascii_case(name)) + .map(|(_, v)| v.as_str()) + } + + #[test] + fn forwarder_request_get_emits_correct_request_line_and_host() { + let req = build_sni_forward_request_bytes( + "GET", + "www.youtube.com", + 443, + "/watch?v=abc", + &[("User-Agent".into(), "Mozilla/5.0".into())], + b"", + ); + let (line, headers, body) = parse_request(&req); + assert_eq!(line, "GET /watch?v=abc HTTP/1.1"); + assert_eq!(header_get_raw(&headers, "Host"), Some("www.youtube.com")); + assert_eq!(header_get_raw(&headers, "Connection"), Some("close")); + assert_eq!(header_get_raw(&headers, "User-Agent"), Some("Mozilla/5.0")); + // GET without body must not emit Content-Length. + assert!( + !header_present(&headers, "Content-Length"), + "GET with no body must not emit Content-Length" + ); + assert!(body.is_empty()); + } + + #[test] + fn forwarder_request_strips_inbound_chunked_and_sets_fresh_content_length() { + // `read_body` decodes chunked request bodies before they reach the + // forwarder, so the Transfer-Encoding header is a lie about the + // bytes we have. The builder MUST drop it AND any inbound + // Content-Length, then emit a single fresh Content-Length matching + // the decoded body length. Otherwise the upstream waits forever + // for chunk markers that aren't there (or reads the wrong number + // of bytes). + let body = b"hello-decoded-body"; + let req = build_sni_forward_request_bytes( + "POST", + "example.com", + 443, + "/api", + &[ + ("Transfer-Encoding".into(), "chunked".into()), + ("Content-Length".into(), "999".into()), // stale lie + ("Content-Type".into(), "application/json".into()), + ], + body, + ); + let (_line, headers, parsed_body) = parse_request(&req); + assert!( + !header_present(&headers, "Transfer-Encoding"), + "Transfer-Encoding must be stripped: {:?}", + headers + ); + assert_eq!( + header_get_raw(&headers, "Content-Length"), + Some(body.len().to_string().as_str()), + "Content-Length must reflect actual body length" + ); + // Make sure there is exactly ONE Content-Length header. + let cl_count = headers + .iter() + .filter(|(k, _)| k.eq_ignore_ascii_case("Content-Length")) + .count(); + assert_eq!(cl_count, 1, "must emit exactly one Content-Length header"); + // Non-framing headers like Content-Type pass through. + assert_eq!( + header_get_raw(&headers, "Content-Type"), + Some("application/json") + ); + assert_eq!(parsed_body, body); + } + + #[test] + fn forwarder_request_drops_hop_by_hop_and_connection_headers() { + let req = build_sni_forward_request_bytes( + "GET", + "www.youtube.com", + 443, + "/", + &[ + ("Connection".into(), "keep-alive".into()), + ("Proxy-Connection".into(), "keep-alive".into()), + ("Keep-Alive".into(), "timeout=5".into()), + ("TE".into(), "trailers".into()), + ("Trailer".into(), "X-Foo".into()), + ("Upgrade".into(), "websocket".into()), + ("Host".into(), "spoofed.example.com".into()), // must be overwritten + ("Accept".into(), "text/html".into()), + ], + b"", + ); + let (_line, headers, _body) = parse_request(&req); + // Forced headers we own. + assert_eq!(header_get_raw(&headers, "Host"), Some("www.youtube.com")); + assert_eq!(header_get_raw(&headers, "Connection"), Some("close")); + // None of the inbound copies of the headers we own may pass through. + let host_count = headers + .iter() + .filter(|(k, _)| k.eq_ignore_ascii_case("Host")) + .count(); + assert_eq!(host_count, 1, "must emit exactly one Host header"); + // Hop-by-hop must be dropped. + assert!(!header_present(&headers, "Proxy-Connection")); + assert!(!header_present(&headers, "Keep-Alive")); + assert!(!header_present(&headers, "TE")); + assert!(!header_present(&headers, "Trailer")); + assert!(!header_present(&headers, "Upgrade")); + // Non-framing pass through. + assert_eq!(header_get_raw(&headers, "Accept"), Some("text/html")); + } + + #[test] + fn forwarder_request_includes_port_in_host_for_nondefault_ports() { + let req = build_sni_forward_request_bytes( + "GET", + "youtube.com", + 8443, + "/", + &[], + b"", + ); + let (_line, headers, _body) = parse_request(&req); + assert_eq!(header_get_raw(&headers, "Host"), Some("youtube.com:8443")); + } + + #[test] + fn forwarder_request_post_with_empty_body_still_emits_content_length() { + // POSTs may legitimately have no body, but origins generally + // expect Content-Length: 0 on a body-bearing method. The + // get/head/options branch is the one that omits CL. + let req = build_sni_forward_request_bytes( + "POST", + "youtube.com", + 443, + "/youtubei/v1/no-body", + &[], + b"", + ); + let (_line, headers, _body) = parse_request(&req); + assert_eq!(header_get_raw(&headers, "Content-Length"), Some("0")); + } + + // ── normalize_pattern ───────────────────────────────────────────────── + + #[test] + fn normalize_pattern_strips_scheme_case_insensitively() { + // The original implementation lowercased AFTER trim_start_matches, + // so `HTTPS://Foo.com/` slipped through with the scheme intact. + // Now we lowercase first. + assert_eq!( + normalize_pattern("HTTPS://YouTube.com/YouTubei/"), + "youtube.com/youtubei/" + ); + assert_eq!( + normalize_pattern("HTTP://Example.com/api/"), + "example.com/api/" + ); + // Bare patterns (no scheme) lower-cased. + assert_eq!( + normalize_pattern("YouTube.com/YouTubei/"), + "youtube.com/youtubei/" + ); + } + + #[test] + fn normalize_pattern_trims_trailing_dot_on_host() { + // FQDN-form host with trailing dot must canonicalise to the same + // form `extract_host` returns (it trims the dot). + assert_eq!( + normalize_pattern("youtube.com./youtubei/"), + "youtube.com/youtubei/" + ); + assert_eq!( + normalize_pattern("https://YouTube.com./api/"), + "youtube.com/api/" + ); + // Trailing dot on host-only patterns (no path) too. + assert_eq!(normalize_pattern("foo.com."), "foo.com"); + } + + #[test] + fn normalize_pattern_preserves_path_dots() { + // Only the host component gets its trailing dot stripped — path + // components keep theirs (a path like `/v1.0/` is legitimate). + assert_eq!( + normalize_pattern("youtube.com/v1.0/"), + "youtube.com/v1.0/" + ); + assert_eq!( + normalize_pattern("youtube.com./v1.0/"), + "youtube.com/v1.0/" + ); + } + + #[test] + fn normalize_pattern_handles_whitespace() { + assert_eq!( + normalize_pattern(" youtube.com/youtubei/ "), + "youtube.com/youtubei/" + ); + } + + // ── host_is_sni_rewrite_capable ────────────────────────────────────── + + #[test] + fn sni_capable_recognises_google_edge_hosts() { + // SNI_REWRITE_SUFFIXES coverage check. + assert!(host_is_sni_rewrite_capable("youtube.com")); + assert!(host_is_sni_rewrite_capable("www.youtube.com")); + assert!(host_is_sni_rewrite_capable("studio.youtube.com")); + assert!(host_is_sni_rewrite_capable("googleapis.com")); + assert!(host_is_sni_rewrite_capable("youtubei.googleapis.com")); + assert!(host_is_sni_rewrite_capable("YouTube.COM")); // case insensitive + assert!(host_is_sni_rewrite_capable("youtube.com.")); // trailing dot + } + + #[test] + fn sni_capable_rejects_non_google_hosts() { + // The whole point of the check: don't let users pull non-Google + // hosts through the SNI-rewrite forwarder, which would return + // wrong-origin responses from the Google edge. + assert!(!host_is_sni_rewrite_capable("evilsite.com")); + assert!(!host_is_sni_rewrite_capable("googlevideo.com")); // not in list + assert!(!host_is_sni_rewrite_capable("api.example.com")); + // Suffix-attack: "x" + matching suffix must not pass. + assert!(!host_is_sni_rewrite_capable("notyoutube.com")); + // Empty / pathological input. + assert!(!host_is_sni_rewrite_capable("")); + } + + #[test] + fn resolved_routing_skips_non_sni_capable_user_pattern_hosts() { + // Direct test of the wrong-origin defense: a user-supplied + // pattern targeting a non-Google host must NOT add to + // `force_mitm_hosts`, because the forwarder would dial Google's + // edge and return a wrong-origin response. The pattern itself + // is preserved in `relay_url_patterns` so a matching path still + // routes via relay if the host is reached through the regular + // TLS-detect → MITM → relay path. + // + // Uses `googleapis.com/api/` as the SNI-capable example — + // intentionally NOT a YT-family host, so the + // `youtube_via_relay`-driven YT-suppression doesn't drop it. + // youtube_via_relay is left off here so the SNI-capable filter + // is the only thing being exercised. + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "relay_url_patterns": [ + "evilsite.com/api/", + "googleapis.com/inner/" + ] + }"#; + let cfg: crate::config::Config = serde_json::from_str(s).unwrap(); + let r = ResolvedRouting::from_config(&cfg, Mode::AppsScript); + // Pattern preserved. + assert!(r + .relay_url_patterns + .contains(&"evilsite.com/api/".to_string())); + assert!(r + .relay_url_patterns + .contains(&"googleapis.com/inner/".to_string())); + // Non-Google host filtered out of force_mitm_hosts. + assert!( + !r.force_mitm_hosts.contains(&"evilsite.com".to_string()), + "evilsite.com must not be force-MITM'd: {:?}", + r.force_mitm_hosts, + ); + // Google-edge host kept. + assert!(r + .force_mitm_hosts + .contains(&"googleapis.com".to_string())); + // And the skip is surfaced for the startup warning. + assert!(r + .skipped_force_mitm_hosts + .contains(&"evilsite.com".to_string())); + } + + // ── Regression: exit_node.mode=full + user pattern ────────────────── + + #[test] + fn youtube_via_relay_drops_user_supplied_yt_patterns() { + // Critical: when youtube_via_relay is on, every YT request goes + // through the relay via the YOUTUBE_RELAY_HOSTS carve-out, so a + // user-supplied `youtube.com/youtubei/` pattern is redundant + // AND harmful — it would re-add youtube.com to force_mitm_hosts + // and the path filter would then route non-matching paths + // through `forward_via_sni_rewrite_http`, partially defeating + // the user's "full YT through relay" opt-in. Dropped at startup + // with a warning. + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "youtube_via_relay": true, + "relay_url_patterns": [ + "youtube.com/youtubei/", + "www.youtube.com/watch", + "googleapis.com/specific-api/" + ] + }"#; + let cfg: crate::config::Config = serde_json::from_str(s).unwrap(); + let r = ResolvedRouting::from_config(&cfg, Mode::AppsScript); + // Both YT-host entries dropped; non-YT entry survives. + assert!( + !r.relay_url_patterns + .iter() + .any(|p| p.starts_with("youtube.com/")), + "youtube.com/* must be dropped: {:?}", + r.relay_url_patterns, + ); + assert!( + !r.relay_url_patterns + .iter() + .any(|p| p.starts_with("www.youtube.com/")), + "www.youtube.com/* must be dropped: {:?}", + r.relay_url_patterns, + ); + assert!(r + .relay_url_patterns + .contains(&"googleapis.com/specific-api/".to_string())); + // youtube.com NOT in force_mitm_hosts (would reactivate the path + // filter); googleapis.com IS. + assert!(!r.force_mitm_hosts.contains(&"youtube.com".to_string())); + assert!(!r.force_mitm_hosts.contains(&"www.youtube.com".to_string())); + assert!(r + .force_mitm_hosts + .contains(&"googleapis.com".to_string())); + // Suppressed list surfaces both for the startup warning. + assert!(r + .suppressed_yt_patterns + .contains(&"youtube.com/youtubei/".to_string())); + assert!(r + .suppressed_yt_patterns + .contains(&"www.youtube.com/watch".to_string())); + } + + #[test] + fn youtube_via_relay_off_keeps_user_supplied_yt_patterns() { + // Sanity check the inverse: when youtube_via_relay is off, user + // YT patterns should remain (the path filter is the whole point + // of relay_url_patterns when YT isn't fully relayed). + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "relay_url_patterns": ["youtube.com/youtubei/v2/"] + }"#; + let cfg: crate::config::Config = serde_json::from_str(s).unwrap(); + let r = ResolvedRouting::from_config(&cfg, Mode::AppsScript); + assert!(r.suppressed_yt_patterns.is_empty()); + // User pattern is in the resolved list (alongside the default). + assert!(r + .relay_url_patterns + .contains(&"youtube.com/youtubei/v2/".to_string())); + assert!(r.force_mitm_hosts.contains(&"youtube.com".to_string())); + } + + #[test] + fn exit_node_full_also_drops_user_supplied_yt_patterns() { + // Belt-and-suspenders: in exit-node-full mode, the runtime + // forwarder gate already blocks bypass, but + // youtube_via_relay_effective is true and the same suppression + // logic applies. A user-supplied YT pattern would be dropped + // here too, which is fine — the exit-node-full contract makes + // it a no-op anyway. + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "relay_url_patterns": ["youtube.com/youtubei/"], + "exit_node": { + "enabled": true, + "relay_url": "https://exit.example.com/relay", + "psk": "shared-psk-1234", + "mode": "full" + } + }"#; + let cfg: crate::config::Config = serde_json::from_str(s).unwrap(); + let r = ResolvedRouting::from_config(&cfg, Mode::AppsScript); + assert!(r.youtube_via_relay_effective); + assert!(r.exit_node_full_mode_active); + assert!(r + .suppressed_yt_patterns + .contains(&"youtube.com/youtubei/".to_string())); + assert!(!r.force_mitm_hosts.contains(&"youtube.com".to_string())); + } + + #[test] + fn host_matches_youtube_relay_one_directional() { + // Same shape as host_in_force_mitm_list — exact match or + // dot-anchored subdomain. + assert!(host_matches_youtube_relay("youtube.com")); + assert!(host_matches_youtube_relay("www.youtube.com")); + assert!(host_matches_youtube_relay("studio.youtube.com")); + assert!(host_matches_youtube_relay("youtu.be")); + assert!(host_matches_youtube_relay("youtube-nocookie.com")); + assert!(host_matches_youtube_relay("youtubei.googleapis.com")); + assert!(host_matches_youtube_relay("v1.youtubei.googleapis.com")); + // Case-insensitive + trailing dot. + assert!(host_matches_youtube_relay("YouTube.com")); + assert!(host_matches_youtube_relay("youtube.com.")); + // Sibling subdomains of the parent SNI suffix don't match. + assert!(!host_matches_youtube_relay("drive.googleapis.com")); + // Substring attack must not match. + assert!(!host_matches_youtube_relay("notyoutube.com")); + assert!(!host_matches_youtube_relay("youtube.com.evil.test")); + } + + #[test] + fn exit_node_full_mode_active_propagates_through_resolved_routing() { + // The flag must round-trip from config to ResolvedRouting so + // RewriteCtx can carry it to handle_mitm_request and gate the + // SNI-HTTP forwarder. Selective-mode exit-nodes don't set it. + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "exit_node": { + "enabled": true, + "relay_url": "https://exit.example.com/relay", + "psk": "shared-psk-1234", + "mode": "full" + } + }"#; + let cfg: crate::config::Config = serde_json::from_str(s).unwrap(); + let r = ResolvedRouting::from_config(&cfg, Mode::AppsScript); + assert!(r.exit_node_full_mode_active); + + // Same config but in selective mode → flag NOT set. + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "exit_node": { + "enabled": true, + "relay_url": "https://exit.example.com/relay", + "psk": "shared-psk-1234", + "mode": "selective", + "hosts": ["chatgpt.com"] + } + }"#; + let cfg: crate::config::Config = serde_json::from_str(s).unwrap(); + let r = ResolvedRouting::from_config(&cfg, Mode::AppsScript); + assert!(!r.exit_node_full_mode_active); + } + + #[test] + fn exit_node_full_keeps_user_patterns_for_relay_routing() { + // Critical correctness invariant: in exit_node.mode=full, a + // user's `relay_url_patterns` entry must NOT cause non-matching + // paths on its host to bypass the exit node. Two halves to the + // contract: + // 1. The user's pattern host is still pulled into + // `force_mitm_hosts` so MITM runs and the in-relay + // `exit_node_matches` can route through the second hop. + // 2. `exit_node_full_mode_active` is true so dispatch knows + // to skip the SNI-HTTP forwarder for non-matching paths, + // sending them to relay → exit node instead of bypassing + // both via the Google edge. + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "relay_url_patterns": ["googleapis.com/specific-api/"], + "exit_node": { + "enabled": true, + "relay_url": "https://exit.example.com/relay", + "psk": "shared-psk-1234", + "mode": "full" + } + }"#; + let cfg: crate::config::Config = serde_json::from_str(s).unwrap(); + let r = ResolvedRouting::from_config(&cfg, Mode::AppsScript); + + // The user pattern survives — they want googleapis.com to be + // MITM'd and routed via relay (which then routes through exit + // node by the full-mode contract). + assert_eq!( + r.relay_url_patterns, + vec!["googleapis.com/specific-api/".to_string()] + ); + assert_eq!(r.force_mitm_hosts, vec!["googleapis.com".to_string()]); + // The default `youtube.com/youtubei/` is correctly suppressed + // because youtube_via_relay_effective is true via exit-node-full. + assert!(!r + .relay_url_patterns + .iter() + .any(|p| p.starts_with("youtube.com/youtubei/"))); + // And the runtime gate fires. + assert!(r.exit_node_full_mode_active); + assert!(r.youtube_via_relay_effective); + } + + #[test] + fn forwarder_dispatch_gate_off_when_exit_node_full() { + // RewriteCtx-level invariant: with exit_node_full_mode_active, + // the gate that decides whether to use the forwarder must be + // observably off — even when every other condition would + // dispatch through it. + // Reconstruct the gate logic that lives in handle_mitm_request, + // since pulling a real RewriteCtx through the test requires an + // I/O-bound DomainFronter. + let force_mitm_hosts = vec!["googleapis.com".to_string()]; + let patterns = vec!["googleapis.com/specific-api/".to_string()]; + let url = "https://api.googleapis.com/other-path"; + let host = "api.googleapis.com"; + let port = 443u16; + let scheme = "https"; + let method = "GET"; + + let method_safe = method.eq_ignore_ascii_case("GET") + || method.eq_ignore_ascii_case("HEAD") + || method.eq_ignore_ascii_case("OPTIONS"); + + // Without the exit-node-full gate, every other condition would + // dispatch through the forwarder. + let pre_gate = scheme == "https" + && port == 443 + && method_safe + && !patterns.is_empty() + && host_in_force_mitm_list(host, &force_mitm_hosts) + && !url_matches_relay_pattern(url, &patterns); + assert!(pre_gate, "test fixture must reach the forwarder gate"); + + // With exit_node_full_mode_active = true, the actual gate is off. + let exit_node_full_mode_active = true; + let actual_gate = scheme == "https" + && port == 443 + && method_safe + && !exit_node_full_mode_active + && !patterns.is_empty() + && host_in_force_mitm_list(host, &force_mitm_hosts) + && !url_matches_relay_pattern(url, &patterns); + assert!( + !actual_gate, + "exit_node.mode=full must disable the forwarder dispatch even \ + when host/path/method would otherwise route through it", + ); + } + + // ── Regression: trailing-dot URL hosts ──────────────────────────────── + + #[test] + fn url_matches_relay_pattern_trims_trailing_dot_on_url_host() { + // `host_in_force_mitm_list` trims trailing dots, so dispatch + // would force-MITM a `www.youtube.com.` request. Without the + // matching trim here, the URL-host-vs-pattern-host suffix + // check failed and `/youtubei/v1/...` would route through the + // SNI-HTTP forwarder instead of the relay — observable as + // SafeSearch staying sticky after a system that emits FQDN + // hostnames (some Linux DNS resolvers, browser DoH paths) hits + // YouTube. + let patterns = vec!["youtube.com/youtubei/".to_string()]; + assert!(url_matches_relay_pattern( + "https://www.youtube.com./youtubei/v1/browse", + &patterns, + )); + assert!(url_matches_relay_pattern( + "https://youtube.com./youtubei/", + &patterns, + )); + } + + #[test] + fn url_matches_relay_pattern_strips_authority_port() { + // Same canonicalisation: an authority with `:443` must match + // pattern hosts that don't include the default port. Otherwise + // the host-vs-pattern compare fails and the dispatcher treats + // the URL as non-matching → forwarder dispatch. + let patterns = vec!["youtube.com/youtubei/".to_string()]; + assert!(url_matches_relay_pattern( + "https://www.youtube.com:443/youtubei/v1/browse", + &patterns, + )); + // Non-default port still match — the URL went through some + // explicit-port flow; the host part is what matters. + assert!(url_matches_relay_pattern( + "https://www.youtube.com:8443/youtubei/v1/browse", + &patterns, + )); + } + + #[test] + fn dispatch_matchers_agree_under_trailing_dot() { + // End-to-end check: same input must lead to the same + // membership decision in both matchers, otherwise the dispatch + // and pattern-check layers disagree (the symptom the reviewer + // flagged: host force-MITM'd but URL-pattern check fails). + let force = vec!["youtube.com".to_string()]; + let patterns = vec!["youtube.com/youtubei/".to_string()]; + for variant in [ + "www.youtube.com", + "www.youtube.com.", + "WWW.YouTube.COM", + "WWW.YouTube.COM.", + ] { + assert!(host_in_force_mitm_list(variant, &force), "{}", variant); + let url = format!("https://{}/youtubei/v1/browse", variant); + assert!(url_matches_relay_pattern(&url, &patterns), "{}", url); + } + } + + // ── fronting_groups precedence ─────────────────────────────────────── + + #[test] + fn fronting_group_overlap_with_relay_pattern_resolves_dispatch_via_group() { + // Documented precedence: dispatch_tunnel checks fronting_groups + // BEFORE force_mitm_hosts (steps 2a vs 2 in dispatch_tunnel). + // A user adding `youtube.com` to a fronting group is making a + // deliberate "alternate edge for YT" choice; the path filter + // assumes the Google edge handles the request and would land + // at the wrong upstream if it ran. The override is intentional; + // this test pins it so a future refactor doesn't accidentally + // flip the precedence. + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "fronting_groups": [{ + "name": "alt-yt-edge", + "ip": "203.0.113.10", + "sni": "react.dev", + "domains": ["youtube.com"] + }] + }"#; + let cfg: crate::config::Config = serde_json::from_str(s).unwrap(); + // ResolvedRouting still includes the default pattern — patterns + // are mode-gated, not fronting-group-gated. The actual override + // happens at dispatch time. + let r = ResolvedRouting::from_config(&cfg, Mode::AppsScript); + assert!(r + .relay_url_patterns + .contains(&"youtube.com/youtubei/".to_string())); + + // Build the resolved fronting group and confirm + // `match_fronting_group` returns it for the YT host. This is + // the call dispatch_tunnel uses at step 2a, BEFORE the force-MITM + // check at step 2 — the YT request never reaches the path filter. + let group = + FrontingGroupResolved::from_config(&cfg.fronting_groups[0]).unwrap(); + let groups = vec![Arc::new(group)]; + assert!(match_fronting_group("www.youtube.com", &groups).is_some()); + assert!(match_fronting_group("youtube.com", &groups).is_some()); + } + + #[test] + fn fronting_group_with_disjoint_domain_does_not_interfere() { + // Sanity check: a fronting group covering an unrelated host + // (vercel.com) does not affect the YT path filter. Guards + // against accidentally widening the precedence rule. + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "fronting_groups": [{ + "name": "vercel", + "ip": "76.76.21.21", + "sni": "react.dev", + "domains": ["vercel.com"] + }] + }"#; + let cfg: crate::config::Config = serde_json::from_str(s).unwrap(); + let r = ResolvedRouting::from_config(&cfg, Mode::AppsScript); + // YT pattern survives untouched. + assert!(r + .relay_url_patterns + .contains(&"youtube.com/youtubei/".to_string())); + + let group = + FrontingGroupResolved::from_config(&cfg.fronting_groups[0]).unwrap(); + let groups = vec![Arc::new(group)]; + // YT host doesn't match the unrelated group. + assert!(match_fronting_group("www.youtube.com", &groups).is_none()); + } + #[test] fn fronting_group_resolve_rejects_invalid_sni() { let bad = FrontingGroup { @@ -3597,4 +5044,417 @@ mod tests { }; assert!(FrontingGroupResolved::from_config(&bad).is_err()); } + + #[test] + fn url_matches_relay_pattern_basic() { + // Default upstream pattern. Path-anchored — matches the + // youtubei prefix, NOT a similarly-named query string. + let patterns = vec!["youtube.com/youtubei/".to_string()]; + assert!(url_matches_relay_pattern( + "https://www.youtube.com/youtubei/v1/browse", + &patterns, + )); + assert!(url_matches_relay_pattern( + "https://m.youtube.com/youtubei/v1/player", + &patterns, + )); + // Bare scheme variant + assert!(url_matches_relay_pattern( + "http://youtube.com/youtubei/", + &patterns, + )); + // Wrong path on the right host + assert!(!url_matches_relay_pattern( + "https://www.youtube.com/watch?v=abc", + &patterns, + )); + // Right path-shape on the wrong host + assert!(!url_matches_relay_pattern( + "https://example.com/youtubei/v1", + &patterns, + )); + // Suffix attack — trailing dot on host should not bypass match. + // (URL parsing strips the trailing dot before reaching here in + // practice; the matcher is strict on the host segment.) + assert!(!url_matches_relay_pattern( + "https://evil-youtube.com/youtubei/", + &patterns, + )); + } + + #[test] + fn url_matches_relay_pattern_empty_patterns_never_matches() { + let empty: Vec = vec![]; + assert!(!url_matches_relay_pattern("https://www.youtube.com/", &empty)); + } + + #[test] + fn host_in_force_mitm_list_is_suffix_anchored() { + let list = vec!["youtube.com".to_string()]; + assert!(host_in_force_mitm_list("youtube.com", &list)); + assert!(host_in_force_mitm_list("www.youtube.com", &list)); + assert!(host_in_force_mitm_list("m.youtube.com", &list)); + // Strict suffix — trailing-dot trim should still match. + assert!(host_in_force_mitm_list("youtube.com.", &list)); + // Substring attack must NOT match. + assert!(!host_in_force_mitm_list("notyoutube.com", &list)); + assert!(!host_in_force_mitm_list("youtube.com.evil.test", &list)); + // Empty list never matches. + let empty: Vec = vec![]; + assert!(!host_in_force_mitm_list("anything", &empty)); + } + + #[test] + fn force_mitm_pulls_host_out_of_sni_rewrite() { + // With `relay_url_patterns: ["youtube.com/youtubei/"]`, the host + // youtube.com gets pulled out of SNI-rewrite so MITM can run + // and inspect paths. Other YT-family hosts (ytimg, ggpht) stay + // on SNI-rewrite — they aren't in the patterns and the user + // hasn't asked for path-level routing on them. + let hosts = std::collections::HashMap::new(); + let force = vec!["youtube.com".to_string()]; + + // youtube.com itself is force-MITM'd → not SNI-rewrite. + assert!(!should_use_sni_rewrite( + &hosts, + "www.youtube.com", + 443, + false, + &force, + )); + assert!(!should_use_sni_rewrite( + &hosts, + "m.youtube.com", + 443, + false, + &force, + )); + // Sibling YT hosts NOT in the force list still SNI-rewrite. + assert!(should_use_sni_rewrite( + &hosts, + "i.ytimg.com", + 443, + false, + &force, + )); + assert!(should_use_sni_rewrite( + &hosts, + "yt3.ggpht.com", + 443, + false, + &force, + )); + } + + #[test] + fn force_mitm_overrides_hosts_override() { + // If the user has both an explicit hosts override AND a relay_url_patterns + // entry that pulls the same host out of SNI-rewrite, the pattern wins — + // we need MITM for the per-path matcher to run. The hosts override is + // still used as the upstream IP by `forward_via_sni_rewrite_http` / + // `do_sni_rewrite_tunnel_from_tcp`, just not as a CONNECT-tunnel target. + let mut hosts = std::collections::HashMap::new(); + hosts.insert("www.youtube.com".to_string(), "1.2.3.4".to_string()); + let force = vec!["youtube.com".to_string()]; + + assert!(!should_use_sni_rewrite( + &hosts, + "www.youtube.com", + 443, + false, + &force, + )); + } + + fn make_test_config(mode: &str) -> crate::config::Config { + let s = format!( + r#"{{ + "mode": "{mode}", + "auth_key": "secret-test-secret-test", + "script_id": "X" + }}"#, + ); + serde_json::from_str(&s).unwrap() + } + + #[test] + fn resolved_routing_apps_script_default_prepends_youtubei_pattern() { + // The default-shipped pattern is `youtube.com/youtubei/`. With no + // user config and no exit node, apps_script mode should resolve + // exactly that one pattern and pull `youtube.com` from + // SNI-rewrite (so MITM can run for path inspection). + let cfg = make_test_config("apps_script"); + let r = ResolvedRouting::from_config(&cfg, Mode::AppsScript); + assert_eq!(r.relay_url_patterns, vec!["youtube.com/youtubei/".to_string()]); + assert_eq!(r.force_mitm_hosts, vec!["youtube.com".to_string()]); + assert!(!r.youtube_via_relay_effective); + assert!(!r.exit_node_full_mode_active); + } + + #[test] + fn resolved_routing_direct_mode_skips_default_pattern() { + // CRITICAL regression guard. In direct mode there is no + // Apps Script relay path. The `youtube.com/youtubei/` default + // would pull `youtube.com` from SNI-rewrite, and the dispatcher + // would then send YT requests to RAW TCP fallback because nothing + // would match SNI-rewrite OR Apps Script. Test asserts that + // direct mode resolves to empty pattern + force-MITM lists. + let cfg = make_test_config("direct"); + let r = ResolvedRouting::from_config(&cfg, Mode::Direct); + assert!( + r.relay_url_patterns.is_empty(), + "direct mode must not populate relay_url_patterns: {:?}", + r.relay_url_patterns, + ); + assert!( + r.force_mitm_hosts.is_empty(), + "direct mode must not populate force_mitm_hosts: {:?}", + r.force_mitm_hosts, + ); + } + + #[test] + fn resolved_routing_full_mode_skips_default_pattern() { + // Mode::Full's dispatcher short-circuits to the tunnel mux + // before MITM runs, so patterns would never be consulted — + // resolving them is dead weight. Same gate as direct mode. + let cfg = make_test_config("full"); + let r = ResolvedRouting::from_config(&cfg, Mode::Full); + assert!(r.relay_url_patterns.is_empty()); + assert!(r.force_mitm_hosts.is_empty()); + } + + #[test] + fn resolved_routing_direct_mode_youtube_still_sni_rewrites() { + // End-to-end check of the direct-mode regression: with the + // resolved sets empty, `should_use_sni_rewrite` should send + // www.youtube.com:443 to the SNI-rewrite tunnel, not raw-TCP + // fallback. + let cfg = make_test_config("direct"); + let r = ResolvedRouting::from_config(&cfg, Mode::Direct); + let hosts = std::collections::HashMap::new(); + assert!(should_use_sni_rewrite( + &hosts, + "www.youtube.com", + 443, + r.youtube_via_relay_effective, + &r.force_mitm_hosts, + )); + } + + #[test] + fn resolved_routing_youtube_via_relay_skips_default_pattern() { + // When the user explicitly opts in to `youtube_via_relay = true`, + // YouTube is fully relayed already — the per-path filter is + // redundant. User extras still resolve, just not the default. + // The user pattern host MUST be SNI-rewrite-capable to land in + // `force_mitm_hosts`; here we use `googleapis.com` since it's + // in `SNI_REWRITE_SUFFIXES`. + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "youtube_via_relay": true, + "relay_url_patterns": ["googleapis.com/api/"] + }"#; + let cfg: crate::config::Config = serde_json::from_str(s).unwrap(); + let r = ResolvedRouting::from_config(&cfg, Mode::AppsScript); + // Default `youtube.com/youtubei/` NOT prepended; user entry kept. + assert_eq!(r.relay_url_patterns, vec!["googleapis.com/api/".to_string()]); + assert_eq!(r.force_mitm_hosts, vec!["googleapis.com".to_string()]); + assert!(r.skipped_force_mitm_hosts.is_empty()); + assert!(r.youtube_via_relay_effective); + } + + #[test] + fn resolved_routing_exit_node_full_mode_skips_default_pattern() { + // CRITICAL regression guard. With `exit_node.mode = "full"` + // and `youtube_via_relay = false`, the prior code prepended + // `youtube.com/youtubei/` even though the YT-via-relay flag + // was effectively true. That made non-`/youtubei/` YouTube + // requests route through `forward_via_sni_rewrite_http`, + // bypassing `DomainFronter::relay` and with it the exit node + // — defeating the whole point of full mode. Now the effective + // flag gates the prepend, and YT goes fully through relay + // (and thus through the exit node). + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "youtube_via_relay": false, + "exit_node": { + "enabled": true, + "relay_url": "https://exit.example.com/relay", + "psk": "shared-psk-1234", + "mode": "full" + } + }"#; + let cfg: crate::config::Config = serde_json::from_str(s).unwrap(); + let r = ResolvedRouting::from_config(&cfg, Mode::AppsScript); + assert!( + r.youtube_via_relay_effective, + "exit_node.mode=full must imply youtube_via_relay (88b2767)", + ); + assert!(r.exit_node_full_mode_active); + assert!( + r.relay_url_patterns.is_empty(), + "exit_node.mode=full must NOT prepend default pattern \ + (would bypass exit node for non-/youtubei/ paths): {:?}", + r.relay_url_patterns, + ); + assert!(r.force_mitm_hosts.is_empty()); + } + + #[test] + fn resolved_routing_exit_node_full_in_direct_mode_does_not_imply_yt_relay() { + // exit_node config is shared across modes but only applies to + // apps_script. In direct mode there's no relay → no exit node + // → the OR with exit-node-full must NOT promote + // youtube_via_relay_effective to true (would be misleading). + let s = r#"{ + "mode": "direct", + "exit_node": { + "enabled": true, + "relay_url": "https://exit.example.com/relay", + "psk": "shared-psk-1234", + "mode": "full" + } + }"#; + let cfg: crate::config::Config = serde_json::from_str(s).unwrap(); + let r = ResolvedRouting::from_config(&cfg, Mode::Direct); + assert!(!r.youtube_via_relay_effective); + assert!(!r.exit_node_full_mode_active); + assert!(r.relay_url_patterns.is_empty()); + } + + #[test] + fn resolved_routing_exit_node_selective_does_not_imply_yt_relay() { + // Exit-node `selective` (the default) only sends listed hosts + // through the second hop. YouTube isn't in the typical CF-anti-bot + // list, and the per-path filter is fine to keep — non-`/youtubei/` + // YT paths going via SNI-rewrite forward is the win the filter + // was designed for. + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "exit_node": { + "enabled": true, + "relay_url": "https://exit.example.com/relay", + "psk": "shared-psk-1234", + "mode": "selective", + "hosts": ["chatgpt.com"] + } + }"#; + let cfg: crate::config::Config = serde_json::from_str(s).unwrap(); + let r = ResolvedRouting::from_config(&cfg, Mode::AppsScript); + assert!(!r.youtube_via_relay_effective); + assert!(!r.exit_node_full_mode_active); + // Default pattern still prepended. + assert_eq!(r.relay_url_patterns, vec!["youtube.com/youtubei/".to_string()]); + } + + #[test] + fn resolved_routing_user_patterns_dedup_against_default() { + // If a user pastes the default pattern verbatim (or with stray + // whitespace / scheme), dedup keeps a single entry. + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "relay_url_patterns": [ + "https://YouTube.com/YouTubei/", + " example.com/api/ " + ] + }"#; + let cfg: crate::config::Config = serde_json::from_str(s).unwrap(); + let r = ResolvedRouting::from_config(&cfg, Mode::AppsScript); + assert_eq!( + r.relay_url_patterns, + vec![ + "youtube.com/youtubei/".to_string(), + "example.com/api/".to_string(), + ], + ); + } + + #[test] + fn force_mitm_pulls_only_configured_host_and_subdomains() { + // One-directional suffix match: an entry like + // `youtubei.googleapis.com` pulls itself and its subdomains, but + // does NOT pull the parent `googleapis.com` or sibling + // subdomains. Sibling traffic stays on SNI-rewrite. This is a + // regression guard against the original bidirectional-match + // implementation, which pulled parents and made + // `host_in_force_mitm_list` disagree with `matches_sni_rewrite`. + let hosts = std::collections::HashMap::new(); + let force = vec!["youtubei.googleapis.com".to_string()]; + + // Exact force host: pulled. + assert!(!should_use_sni_rewrite( + &hosts, + "youtubei.googleapis.com", + 443, + false, + &force, + )); + // Subdomain of the force host: pulled. + assert!(!should_use_sni_rewrite( + &hosts, + "v1.youtubei.googleapis.com", + 443, + false, + &force, + )); + // Sibling subdomain of the parent: NOT pulled (stays on SNI-rewrite). + assert!(should_use_sni_rewrite( + &hosts, + "drive.googleapis.com", + 443, + false, + &force, + )); + } + + #[test] + fn force_mitm_subdomain_does_not_pull_parent_sni_suffix() { + // Direct test of the asymmetry that motivated dropping the + // bidirectional clause. force=`studio.youtube.com` must NOT + // make `www.youtube.com` or bare `youtube.com` pull out of + // SNI-rewrite — those should still take the SNI-rewrite tunnel + // (matched via the `youtube.com` entry in SNI_REWRITE_SUFFIXES). + // Otherwise the dispatch-side `host_in_force_mitm_list` would + // disagree (no recognition of the parent), and parent-host + // traffic would be force-MITM'd-then-blindly-relayed instead of + // taking the fast SNI tunnel. + let hosts = std::collections::HashMap::new(); + let force = vec!["studio.youtube.com".to_string()]; + + // Configured host pulled. + assert!(!should_use_sni_rewrite( + &hosts, + "studio.youtube.com", + 443, + false, + &force, + )); + // Parent NOT pulled — still SNI-rewrites. + assert!(should_use_sni_rewrite( + &hosts, + "youtube.com", + 443, + false, + &force, + )); + assert!(should_use_sni_rewrite( + &hosts, + "www.youtube.com", + 443, + false, + &force, + )); + // Matchers must agree on membership of the parent. + assert!(!host_in_force_mitm_list("youtube.com", &force)); + assert!(!host_in_force_mitm_list("www.youtube.com", &force)); + } } From 985dc600a6c2ce67b17242382a41e327b4e0e442 Mon Sep 17 00:00:00 2001 From: dazzling-no-more <278675588+dazzling-no-more@users.noreply.github.com> Date: Sun, 10 May 2026 16:05:53 +0400 Subject: [PATCH 2/5] feat(config): sabr_strip kill-switch + non_exhaustive Config --- .../java/com/therealaleph/mhrv/ConfigStore.kt | 39 ++- src/bin/ui.rs | 27 ++- src/config.rs | 77 ++++++ src/domain_fronter.rs | 229 +++++++++++++++--- 4 files changed, 310 insertions(+), 62 deletions(-) diff --git a/android/app/src/main/java/com/therealaleph/mhrv/ConfigStore.kt b/android/app/src/main/java/com/therealaleph/mhrv/ConfigStore.kt index df20839a..d42f6eb9 100644 --- a/android/app/src/main/java/com/therealaleph/mhrv/ConfigStore.kt +++ b/android/app/src/main/java/com/therealaleph/mhrv/ConfigStore.kt @@ -160,27 +160,18 @@ data class MhrvConfig( val youtubeViaRelay: Boolean = false, /** - * Path-pinned relay routing (Rust `relay_url_patterns`, upstream - * commit b3b9220). Each entry is a `host/path-prefix` (no scheme, - * lowercase) — paths matching go through the Apps Script relay, - * non-matching paths on the same host fall through to a direct - * SNI-rewrite HTTP forward (saving Apps Script quota). - * - * The Rust side prepends the default `youtube.com/youtubei/` - * pattern at startup, with two suppression gates: - * - `youtubeViaRelay = true` (full YT through relay → filter is - * redundant) - * - exit-node "full" mode (every URL must route through the - * second-hop exit node → filter would bypass it) - * Plus, when either gate is active, user-supplied patterns whose - * host overlaps `YOUTUBE_RELAY_HOSTS` (youtube.com, youtu.be, - * youtube-nocookie.com, youtubei.googleapis.com) are dropped at - * startup with a warning, since they would partially defeat the - * full-relay contract. - * - * This Android-side field is for *additional* user entries only — - * round-tripped through config.json so a hand-edited extension - * survives a save. No UI editor (power-user knob). + * SABR quality-track strip kill-switch (Rust `sabr_strip`). + * Default true. See `src/config.rs` `sabr_strip` for the trade-off + * and when to flip — Android-side is just round-trip plumbing. + */ + val sabrStrip: Boolean = true, + + /** + * Path-pinned relay routing (Rust `relay_url_patterns`). + * See `src/config.rs` `relay_url_patterns` for the full semantics — + * suppression gates, default pattern, host-overlap rules. This + * Android-side field is for *additional* user entries only, + * round-tripped so a hand-edited list survives Save. */ val relayUrlPatterns: List = emptyList(), @@ -265,6 +256,10 @@ data class MhrvConfig( put("tunnel_doh", tunnelDoh) put("block_doh", blockDoh) if (youtubeViaRelay) put("youtube_via_relay", true) + // sabr_strip default is true on the Rust side; emit only + // when the user has explicitly disabled it so unchanged + // configs stay clean. #977 kill-switch. + if (!sabrStrip) put("sabr_strip", false) // Trim/drop-empty/dedupe before serializing — same pattern // as bypass_doh_hosts. Skip the key entirely when the user // hasn't added any extras so we don't leak an empty array @@ -385,6 +380,7 @@ object ConfigStore { if (cfg.tunnelDoh != defaults.tunnelDoh) obj.put("tunnel_doh", cfg.tunnelDoh) if (cfg.blockDoh != defaults.blockDoh) obj.put("block_doh", cfg.blockDoh) if (cfg.youtubeViaRelay != defaults.youtubeViaRelay) obj.put("youtube_via_relay", cfg.youtubeViaRelay) + if (cfg.sabrStrip != defaults.sabrStrip) obj.put("sabr_strip", cfg.sabrStrip) val cleanBypassDohHosts = cfg.bypassDohHosts .map { it.trim() } .filter { it.isNotEmpty() } @@ -499,6 +495,7 @@ object ConfigStore { tunnelDoh = obj.optBoolean("tunnel_doh", true), blockDoh = obj.optBoolean("block_doh", true), youtubeViaRelay = obj.optBoolean("youtube_via_relay", false), + sabrStrip = obj.optBoolean("sabr_strip", true), bypassDohHosts = obj.optJSONArray("bypass_doh_hosts")?.let { arr -> buildList { for (i in 0 until arr.length()) add(arr.optString(i)) } }?.filter { it.isNotBlank() }.orEmpty(), diff --git a/src/bin/ui.rs b/src/bin/ui.rs index c596ada7..b0e50177 100644 --- a/src/bin/ui.rs +++ b/src/bin/ui.rs @@ -251,9 +251,12 @@ struct FormState { google_ip_validation: bool, normalize_x_graphql: bool, youtube_via_relay: bool, - /// Round-tripped from config.json. No UI control yet — power-user - /// edit. See config.rs `relay_url_patterns` (b3b9220). + /// See `config::Config::relay_url_patterns` for semantics + defaults. + /// No UI control; round-tripped so a hand-edited list survives Save. relay_url_patterns: Vec, + /// See `config::Config::sabr_strip` for trade-off + when to flip. + /// No UI control; round-tripped so a hand-edited `false` survives Save. + sabr_strip: bool, passthrough_hosts: Vec, /// Round-tripped from config.json so the UI's save path doesn't /// drop the user's setting. Not currently exposed as a UI control; @@ -389,6 +392,7 @@ fn load_form() -> (FormState, Option) { normalize_x_graphql: c.normalize_x_graphql, youtube_via_relay: c.youtube_via_relay, relay_url_patterns: c.relay_url_patterns.clone(), + sabr_strip: c.sabr_strip, passthrough_hosts: c.passthrough_hosts.clone(), block_quic: c.block_quic, disable_padding: c.disable_padding, @@ -429,6 +433,7 @@ fn load_form() -> (FormState, Option) { normalize_x_graphql: false, youtube_via_relay: false, relay_url_patterns: Vec::new(), + sabr_strip: true, passthrough_hosts: Vec::new(), block_quic: true, disable_padding: false, @@ -585,11 +590,10 @@ impl FormState { // config-only flag for now. Passed through from the loaded // config if set, otherwise defaults to false. youtube_via_relay: self.youtube_via_relay, - // relay_url_patterns is config-only too (b3b9220). No UI - // editor yet — the default `youtube.com/youtubei/` ships - // automatically; round-trip preserves any extras the user - // added by hand. + // Config-only round-trips. Source of truth for both fields + // is `config::Config` (defaults, gating, trade-offs). relay_url_patterns: self.relay_url_patterns.clone(), + sabr_strip: self.sabr_strip, // Similarly config-only for now; round-trips through the // file so the UI doesn't drop the user's entries on save. passthrough_hosts: self.passthrough_hosts.clone(), @@ -676,12 +680,14 @@ struct ConfigWire<'a> { normalize_x_graphql: bool, #[serde(skip_serializing_if = "is_false")] youtube_via_relay: bool, - /// Path-prefix relay routing (b3b9220). Default is empty — the - /// built-in `youtube.com/youtubei/` is added at proxy startup, not - /// written into config.json — so configs stay clean unless the user - /// added their own extras. + /// See `config::Config::relay_url_patterns`. Skipped when empty so + /// the proxy-applied default isn't echoed into config.json. #[serde(skip_serializing_if = "Vec::is_empty")] relay_url_patterns: &'a Vec, + /// See `config::Config::sabr_strip`. Default `true`; emitted only + /// when explicitly disabled so unchanged configs stay clean. + #[serde(skip_serializing_if = "is_true")] + sabr_strip: bool, #[serde(skip_serializing_if = "Vec::is_empty")] passthrough_hosts: &'a Vec, // IP-scan knobs. These used to be missing from the wire struct, so @@ -790,6 +796,7 @@ impl<'a> From<&'a Config> for ConfigWire<'a> { normalize_x_graphql: c.normalize_x_graphql, youtube_via_relay: c.youtube_via_relay, relay_url_patterns: &c.relay_url_patterns, + sabr_strip: c.sabr_strip, passthrough_hosts: &c.passthrough_hosts, fetch_ips_from_api: c.fetch_ips_from_api, max_ips_to_scan: c.max_ips_to_scan, diff --git a/src/config.rs b/src/config.rs index 604c12e0..c0bd5480 100644 --- a/src/config.rs +++ b/src/config.rs @@ -57,7 +57,18 @@ impl ScriptId { } } +/// Top-level config schema. Marked `#[non_exhaustive]` so adding a new +/// field (which the YouTube + DPI work has done several times in the +/// last few releases) isn't a breaking change for downstream Rust +/// consumers depending on `mhrv_rs` as a library — they'll be forced +/// to use functional-update or `serde_json::from_str` rather than a +/// full struct literal, which means a new field added later doesn't +/// fail their build until they choose to surface it. +/// +/// In-crate code can still construct via struct literal as usual; +/// `#[non_exhaustive]` only restricts cross-crate construction. #[derive(Debug, Clone, Deserialize)] +#[non_exhaustive] pub struct Config { pub mode: String, #[serde(default = "default_google_ip")] @@ -199,6 +210,32 @@ pub struct Config { #[serde(default)] pub relay_url_patterns: Vec, + /// Strip SABR quality-track entries (top-level field-3 of the + /// segment-fetch protobuf) from `/videoplayback` POST bodies on + /// `*.googlevideo.com` / `*.youtube.com`. Default `true` — this is + /// the upstream-parity behaviour and the fix for "Response too + /// large" 502s on multi-track segment fetches that exceed Apps + /// Script `UrlFetchApp`'s ~10 MB cap (commits 9b6d03e + 33db28a + /// from upstream Python). + /// + /// **Why this kill-switch exists** (#977 testing report from + /// `unacoder`, May 2026): under `apps_script` mode with + /// `youtube_via_relay = false`, forcing single-quality-track + /// responses can interact poorly with playback at faster-than-1× + /// speeds (1.7×–2×). Each chunk represents less buffer-ahead + /// duration than a multi-track bundle would, and at speed-up the + /// player drains the buffer faster than the next chunk arrives — + /// reported as "only one videoplayback request is buffered." + /// + /// Flip to `false` if you hit that regression. The trade-off is + /// that long-form videos may then 502 on segments where Google + /// would have bundled multiple quality tracks; the player handles + /// that by falling back to a lower quality. The pre-port behaviour + /// (no SABR strip) was tolerable for most users — the strip is a + /// quality-of-life fix for the specific bundling-blowup case. + #[serde(default = "default_sabr_strip")] + pub sabr_strip: bool, + /// User-configurable passthrough list. Any host whose name matches /// one of these entries bypasses the Apps Script relay entirely and /// is plain-TCP-passthroughed (optionally through `upstream_socks5`). @@ -430,7 +467,13 @@ pub struct Config { } /// Configuration for the optional second-hop exit node. +/// +/// Same `#[non_exhaustive]` rationale as `Config` — the exit-node +/// feature is still maturing (host lists, mode strings, retry knobs are +/// all places future fields are likely to land), so cross-crate +/// consumers should round-trip through serde rather than struct literal. #[derive(Debug, Clone, Default, Deserialize, Serialize)] +#[non_exhaustive] pub struct ExitNodeConfig { /// Master switch. Default false. Even with `relay_url` and `psk` /// set, nothing routes through the exit node unless this is true. @@ -559,6 +602,12 @@ fn default_auto_blacklist_cooldown_secs() -> u64 { 120 } /// hard-coded `BATCH_TIMEOUT` and Apps Script's typical response cliff. fn default_request_timeout_secs() -> u64 { 30 } +/// Default for `sabr_strip`: `true`. Matches upstream-parity behaviour +/// and the headline "no more 10 MB UrlFetchApp 502s on multi-track +/// segment fetches" win. Users hitting the speed-up-playback buffering +/// regression (#977) can opt out with `sabr_strip: false`. +fn default_sabr_strip() -> bool { true } + fn default_google_ip() -> String { "216.239.38.120".into() } @@ -1116,6 +1165,34 @@ mod tests { ); } + #[test] + fn sabr_strip_defaults_to_true_when_omitted() { + // Existing configs from before the kill-switch landed don't + // have the field. `serde(default = "default_sabr_strip")` must + // resolve to true so the upstream-parity behaviour is what + // unchanged users get. + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X" + }"#; + let cfg: Config = serde_json::from_str(s).unwrap(); + assert!(cfg.sabr_strip, "default must be true (strip enabled)"); + } + + #[test] + fn sabr_strip_round_trips_explicit_false() { + // Users hitting the #977 buffering regression flip this to false. + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "sabr_strip": false + }"#; + let cfg: Config = serde_json::from_str(s).unwrap(); + assert!(!cfg.sabr_strip, "explicit false must round-trip"); + } + #[test] fn config_validate_accepts_well_formed_relay_patterns() { // Mixed canonical / scheme-prefixed / bare-host entries all pass. diff --git a/src/domain_fronter.rs b/src/domain_fronter.rs index a80afd27..737b8e38 100644 --- a/src/domain_fronter.rs +++ b/src/domain_fronter.rs @@ -359,6 +359,12 @@ pub struct DomainFronter { /// Pre-normalized (lowercased, leading-dot stripped) host list for /// fast O(N) match in `exit_node_matches`. exit_node_hosts: Vec, + /// Strip SABR quality-track entries from `/videoplayback` POST + /// bodies. Mirrors `Config::sabr_strip`. Default `true`. Kill-switch + /// for users who hit the speed-up-playback buffering regression + /// reported in #977. See config.rs `sabr_strip` for the full + /// trade-off. + sabr_strip: bool, } /// Aggregated stats for one remote host. @@ -571,6 +577,7 @@ impl DomainFronter { .map(|h| h.trim().trim_start_matches('.').to_ascii_lowercase()) .filter(|h| !h.is_empty()) .collect(), + sabr_strip: config.sabr_strip, }) } @@ -606,6 +613,57 @@ impl DomainFronter { self.batch_timeout } + /// Return `Some(stripped_body)` when the SABR quality-track strip + /// applies to this request and actually removed bytes; `None` when + /// the original body should pass through untouched. + /// + /// Six gates, all of which must pass for the strip to fire: + /// 1. `Config::sabr_strip` is on (kill-switch from #977 testing — + /// see `src/config.rs` for the full trade-off). + /// 2. POST method (segment fetches are POSTs in YouTube's SABR + /// protocol; GETs and other methods don't carry the protobuf). + /// 3. Non-empty body (no body to walk → nothing to strip). + /// 4. URL path contains `/videoplayback` (the SABR endpoint). + /// 5. URL host is in `url_host_is_youtube_video_endpoint`'s set + /// (`*.googlevideo.com` or `*.youtube.com`) — defends against + /// an unrelated service that happens to expose the same path + /// shape with a protobuf-like body. + /// 6. The strip itself actually removed at least one byte — + /// session-init bodies (no field-2) and bodies without any + /// field-3 entries return unchanged from + /// `strip_sabr_quality_tracks` and we report `None` so the + /// caller skips the allocation churn. + /// + /// Pulled out of `relay()` so the gate can be exercised by unit + /// tests without standing up a live Apps Script connection — see + /// `sabr_strip_off_keeps_body_unchanged` / + /// `sabr_strip_on_strips_segment_fetch_body`. + pub(crate) fn maybe_strip_sabr_body( + &self, + method: &str, + url: &str, + body: &[u8], + ) -> Option> { + if !self.sabr_strip + || method != "POST" + || body.is_empty() + || !url.contains("/videoplayback") + || !url_host_is_youtube_video_endpoint(url) + { + return None; + } + let stripped = strip_sabr_quality_tracks(body); + if stripped.len() == body.len() { + return None; + } + tracing::debug!( + "SABR strip: removed {} quality-track bytes from {}", + body.len() - stripped.len(), + url.split('?').next().unwrap_or(url), + ); + Some(stripped) + } + /// Record one relay call toward the daily budget. Called once per /// outbound Apps Script fetch. Rolls over both daily counters at /// 00:00 Pacific Time, matching Apps Script's quota reset cadence @@ -1645,38 +1703,17 @@ impl DomainFronter { url }; - // SABR quality-track strip for YouTube `/videoplayback` POST bodies. - // Has to run before the exit-node short-circuit so the exit-node - // path also benefits — same quota-relief reason as the relay path - // (Apps Script's UrlFetchApp + the exit node both cap at ~10 MB). - // Pure transform on body bytes; no-op on non-SABR payloads. - // - // Host-gated to YouTube's videoplayback hosts so an unrelated - // service that happens to expose a `/videoplayback` endpoint - // serving a protobuf-shaped POST body (with top-level fields 2 - // and 3) doesn't get its field-3 entries silently rewritten. - // YouTube's videoplayback URLs live on either `*.googlevideo.com` - // (the chunk CDN) or `youtube.com` itself; both are gated. + // SABR quality-track strip — applied before the exit-node + // short-circuit so both paths benefit. The full decision + + // trade-off is in `maybe_strip_sabr_body` so the gate can be + // unit-tested without invoking the network-coupled relay path. let stripped_body; - let body: &[u8] = if method == "POST" - && !body.is_empty() - && url.contains("/videoplayback") - && url_host_is_youtube_video_endpoint(url) - { - let s = strip_sabr_quality_tracks(body); - if s.len() != body.len() { - tracing::debug!( - "SABR strip: removed {} quality-track bytes from {}", - body.len() - s.len(), - url.split('?').next().unwrap_or(url), - ); - stripped_body = s; + let body: &[u8] = match self.maybe_strip_sabr_body(method, url, body) { + None => body, + Some(stripped) => { + stripped_body = stripped; stripped_body.as_slice() - } else { - body } - } else { - body }; // Exit-node short-circuit: route through the configured second-hop @@ -5284,6 +5321,14 @@ hello"; /// `verify_ssl=true` and a placeholder `google_ip` are fine because /// `DomainFronter::new` doesn't touch the network. fn fronter_for_test(force_http1: bool) -> DomainFronter { + fronter_for_test_with(force_http1, true) + } + + /// `fronter_for_test` plus a `sabr_strip` knob for the SABR + /// kill-switch gate tests. Lets the tests prove the runtime + /// behaviour at the `relay()` strip-decision branch — not just + /// the config-default round-trip. + fn fronter_for_test_with(force_http1: bool, sabr_strip: bool) -> DomainFronter { let json = format!( r#"{{ "mode": "apps_script", @@ -5295,9 +5340,10 @@ hello"; "listen_port": 8085, "log_level": "info", "verify_ssl": true, - "force_http1": {} + "force_http1": {}, + "sabr_strip": {} }}"#, - force_http1 + force_http1, sabr_strip ); let cfg: Config = serde_json::from_str(&json).unwrap(); DomainFronter::new(&cfg).expect("test fronter must construct") @@ -6124,6 +6170,127 @@ hello"; // ── SABR host gate (defense against unrelated /videoplayback) ──────── + // ── SABR kill-switch runtime gate (#977) ───────────────────────────── + + /// Build a known segment-fetch body (field-2 + field-3) that the + /// strip would actually shrink — used to prove the gate at runtime + /// rather than just the config-default round-trip. + fn segment_fetch_body() -> Vec { + let mut body: Vec = Vec::new(); + enc_length_delim(&mut body, 2, b"range-descriptor"); + enc_length_delim(&mut body, 3, b"quality-track-selector"); + body + } + + #[test] + fn sabr_strip_on_strips_segment_fetch_body_via_relay_gate() { + // sabr_strip = true (default): segment-fetch POST to a real + // googlevideo URL is stripped. This protects the main behaviour + // the kill-switch gates — if a future refactor accidentally + // drops the `self.sabr_strip` check from `relay()`, the strip + // would still apply on `true` and the test would pass; if the + // refactor accidentally INVERTS the check, this test fails + // because no bytes are removed. + let fronter = fronter_for_test_with(false, true); + let body = segment_fetch_body(); + let result = fronter.maybe_strip_sabr_body( + "POST", + "https://rrx---sn-xxx.googlevideo.com/videoplayback?id=42", + &body, + ); + let stripped = result.expect("sabr_strip=true must strip a segment-fetch body"); + assert!( + stripped.len() < body.len(), + "strip must remove at least one byte ({} -> {})", + body.len(), + stripped.len(), + ); + // And the field-3 entry specifically is what was removed. + assert!( + !stripped.windows(b"quality-track-selector".len()) + .any(|w| w == b"quality-track-selector"), + "field-3 payload must be gone from stripped body", + ); + } + + #[test] + fn sabr_strip_off_keeps_body_unchanged_via_relay_gate() { + // sabr_strip = false: same body, same URL, gate must report + // None so `relay()` passes the body through verbatim. This is + // the regression test for the #977 kill-switch — if someone + // removes the `self.sabr_strip` check, this test fails. + let fronter = fronter_for_test_with(false, false); + let body = segment_fetch_body(); + let result = fronter.maybe_strip_sabr_body( + "POST", + "https://rrx---sn-xxx.googlevideo.com/videoplayback?id=42", + &body, + ); + assert!( + result.is_none(), + "sabr_strip=false must report None (no transformation): got {:?}", + result, + ); + } + + #[test] + fn sabr_strip_gate_respects_method_and_url_even_when_flag_is_on() { + // Other gates: only POST + /videoplayback + YT-host triggers. + // This protects the host-gate / method-gate / path-gate work + // from a refactor that conflates the kill-switch with the rest. + let fronter = fronter_for_test_with(false, true); + let body = segment_fetch_body(); + + // GET on the right URL: gate off (not POST). + assert!(fronter + .maybe_strip_sabr_body( + "GET", + "https://rrx---sn-xxx.googlevideo.com/videoplayback", + &body, + ) + .is_none()); + + // POST on a non-YT host: gate off (host gate). + assert!(fronter + .maybe_strip_sabr_body( + "POST", + "https://api.example.com/videoplayback", + &body, + ) + .is_none()); + + // POST on YT host without /videoplayback path: gate off. + assert!(fronter + .maybe_strip_sabr_body( + "POST", + "https://www.youtube.com/youtubei/v1/player", + &body, + ) + .is_none()); + } + + #[test] + fn sabr_strip_gate_returns_none_when_strip_is_a_no_op() { + // Body without field-3 (or without field-2) survives + // `strip_sabr_quality_tracks` unchanged. The gate then reports + // None so the caller doesn't pay for a redundant clone of an + // unmodified buffer. + let fronter = fronter_for_test_with(false, true); + let mut session_init: Vec = Vec::new(); + // field-5 + field-3, no field-2 → strip is a no-op. + enc_length_delim(&mut session_init, 5, b"session-context"); + enc_length_delim(&mut session_init, 3, b"essential-metadata"); + let result = fronter.maybe_strip_sabr_body( + "POST", + "https://rrx---sn-xxx.googlevideo.com/videoplayback", + &session_init, + ); + assert!( + result.is_none(), + "no-op strip must report None to avoid redundant alloc" + ); + } + #[test] fn sabr_host_gate_recognises_youtube_video_endpoints() { // YouTube chunk CDN and yt itself. From 9e7ddcb402ae9aa38d34330daf5327b33af26ac0 Mon Sep 17 00:00:00 2001 From: dazzling-no-more <278675588+dazzling-no-more@users.noreply.github.com> Date: Sun, 10 May 2026 16:38:32 +0400 Subject: [PATCH 3/5] feat(stats): yt_forwarder counters + tracing target --- .../main/java/com/therealaleph/mhrv/Native.kt | 18 +- src/bin/ui.rs | 18 +- src/domain_fronter.rs | 228 +++++++++++++++++- src/proxy_server.rs | 37 ++- 4 files changed, 293 insertions(+), 8 deletions(-) diff --git a/android/app/src/main/java/com/therealaleph/mhrv/Native.kt b/android/app/src/main/java/com/therealaleph/mhrv/Native.kt index fbf44dc5..a72a6940 100644 --- a/android/app/src/main/java/com/therealaleph/mhrv/Native.kt +++ b/android/app/src/main/java/com/therealaleph/mhrv/Native.kt @@ -104,7 +104,23 @@ object Native { * points scope as h2_calls. Compute h2 health as * h2_calls / (h2_calls + h2_fallbacks)), * h2_disabled (boolean: true when h2 fast path is permanently - * off — config force_http1 set, or peer refused h2 via ALPN) + * off — config force_http1 set, or peer refused h2 via ALPN), + * forwarder_calls (successful upstream fetches via the + * SNI-rewrite forwarder — fast path for non-/youtubei/ + * paths on `force_mitm_hosts`. Counted at upstream-success, + * before the downstream write to the browser, so a client + * disconnect mid-write still counts. Zero in non-AppsScript + * modes / when no `relay_url_patterns` host is in play), + * forwarder_bytes (response bytes successfully fetched by the + * forwarder; same upstream-fetch-success semantic as + * forwarder_calls), + * forwarder_errors (forwarder dispatch errors — connect failure, + * TLS error, read timeout, response cap exceeded. Distinct + * from relay_failures: this counts fast-path-only misses + * regardless of whether the relay-fallback then recovered the + * request. Combine with relay_failures to distinguish "fast + * path missed but request served" from "request failed + * end-to-end") * * Cheap — just reads atomics. Safe to poll on a second-scale timer. */ diff --git a/src/bin/ui.rs b/src/bin/ui.rs index b0e50177..85626a3c 100644 --- a/src/bin/ui.rs +++ b/src/bin/ui.rs @@ -1338,7 +1338,7 @@ impl eframe::App for App { if let Some(s) = &stats { // Compact two-column layout so 7 metrics fit in ~4 rows // instead of a tall vertical strip. - let rows: Vec<(&str, String)> = vec![ + let mut rows: Vec<(&str, String)> = vec![ ("relay calls", s.relay_calls.to_string()), ("failures", s.relay_failures.to_string()), ("coalesced", s.coalesced.to_string()), @@ -1362,6 +1362,22 @@ impl eframe::App for App { ), ), ]; + // Forwarder rows only appear once the path filter + // has fired at least once — otherwise the typical + // (no-pattern-hit / non-AppsScript) user sees an + // empty pair of "0" rows that adds noise without + // signal. `err` is fast-path-miss count; combine + // with `relay_failures` to gauge end-to-end health. + if s.forwarder_calls + s.forwarder_errors > 0 { + rows.push(( + "fwd calls", + format!( + "{} (err {})", + s.forwarder_calls, s.forwarder_errors + ), + )); + rows.push(("fwd bytes", fmt_bytes(s.forwarder_bytes))); + } egui::Grid::new("stats") .num_columns(4) .spacing([16.0, 4.0]) diff --git a/src/domain_fronter.rs b/src/domain_fronter.rs index 737b8e38..2c1b7700 100644 --- a/src/domain_fronter.rs +++ b/src/domain_fronter.rs @@ -311,6 +311,28 @@ pub struct DomainFronter { /// h2_fallbacks)` ratio indicates an unhealthy h2 conn or a flaky /// middlebox eating h2 frames; consider `force_http1: true`. h2_fallbacks: AtomicU64, + /// Successful SNI-rewrite HTTP forwarder calls (the b3b9220 path — + /// non-`/youtubei/` paths on `force_mitm_hosts` taking the direct + /// SNI-rewrite TLS path instead of burning Apps Script quota). + /// Counts upstream-fetch success: incremented as soon as the + /// forwarder has the response bytes in hand, BEFORE writing to the + /// browser. A client-disconnect during the downstream write still + /// counts — the path filter did upstream work, that's the metric. + /// Useful for diagnosing reports like #977: a high + /// `forwarder_calls / (forwarder_calls + relay_calls)` ratio means + /// the path filter is doing its job; a near-zero ratio means it's + /// inert (and any reported regression isn't from the forwarder). + forwarder_calls: AtomicU64, + /// Response bytes successfully fetched by the forwarder from the + /// upstream (Google edge). Same upstream-fetch-success semantic as + /// `forwarder_calls`. + forwarder_bytes: AtomicU64, + /// Forwarder dispatch errors — connect failure, TLS error, read + /// timeout, response cap exceeded, or upstream EOF before any + /// bytes. Counts the forwarder fast-path miss; says nothing about + /// whether the subsequent relay-path fallback recovered the + /// request. Use `relay_failures` for request-failure counting. + forwarder_errors: AtomicU64, /// Per-host breakdown of traffic going through this fronter. Keyed by /// the host of the URL (e.g. "api.x.com"). Read-mostly; only touched /// on the slow path (once per relayed request), so a plain Mutex is @@ -542,6 +564,9 @@ impl DomainFronter { bytes_relayed: AtomicU64::new(0), h2_calls: AtomicU64::new(0), h2_fallbacks: AtomicU64::new(0), + forwarder_calls: AtomicU64::new(0), + forwarder_bytes: AtomicU64::new(0), + forwarder_errors: AtomicU64::new(0), per_site: Arc::new(std::sync::Mutex::new(HashMap::new())), today_calls: AtomicU64::new(0), today_bytes: AtomicU64::new(0), @@ -613,6 +638,27 @@ impl DomainFronter { self.batch_timeout } + /// Record a successful upstream fetch by the SNI-rewrite forwarder. + /// `bytes` is the response size received from the Google edge. + /// Counted at the point of upstream success, BEFORE the proxy + /// writes the bytes to the browser — a client disconnect mid-write + /// still leaves the metric accurate (the path filter did its + /// work). Called by `proxy_server::handle_mitm_request`. + pub(crate) fn record_forwarder_call(&self, bytes: u64) { + self.forwarder_calls.fetch_add(1, Ordering::Relaxed); + self.forwarder_bytes.fetch_add(bytes, Ordering::Relaxed); + } + + /// Record a forwarder dispatch error (connect failure, TLS error, + /// read timeout, cap exceeded, ...). Says nothing about whether + /// the subsequent relay-path fallback recovered the request — use + /// `relay_failures` for that. The two metrics together let + /// diagnostics distinguish "fast path missed but request still + /// served" from "request failed end-to-end." + pub(crate) fn record_forwarder_error(&self) { + self.forwarder_errors.fetch_add(1, Ordering::Relaxed); + } + /// Return `Some(stripped_body)` when the SABR quality-track strip /// applies to this request and actually removed bytes; `None` when /// the original body should pass through untouched. @@ -745,6 +791,9 @@ impl DomainFronter { h2_calls: self.h2_calls.load(Ordering::Relaxed), h2_fallbacks: self.h2_fallbacks.load(Ordering::Relaxed), h2_disabled: self.h2_disabled.load(Ordering::Relaxed), + forwarder_calls: self.forwarder_calls.load(Ordering::Relaxed), + forwarder_bytes: self.forwarder_bytes.load(Ordering::Relaxed), + forwarder_errors: self.forwarder_errors.load(Ordering::Relaxed), } } @@ -4420,6 +4469,27 @@ pub struct StatsSnapshot { /// switch set, or peer refused h2 during ALPN). All traffic on the /// h1 path. pub h2_disabled: bool, + /// Successful upstream fetches by the SNI-rewrite forwarder + /// (b3b9220 fast path — non-`/youtubei/` paths on + /// `force_mitm_hosts`). Counted at upstream success, before the + /// downstream write to the browser, so a client-disconnect during + /// write still counts as "the path filter did upstream work." + /// Useful for diagnosing whether the path filter is firing as + /// expected; a near-zero ratio against `relay_calls` means the + /// forwarder is inert and any reported regression is elsewhere. + pub forwarder_calls: u64, + /// Response bytes successfully fetched by the forwarder from the + /// upstream. Same upstream-fetch-success semantic as + /// `forwarder_calls`. + pub forwarder_bytes: u64, + /// Forwarder dispatch errors (connect failure, TLS error, read + /// timeout, cap exceeded). Distinct from `relay_failures` — + /// `relay_failures` counts request-level failures, this counts + /// fast-path-only misses regardless of whether the relay-fallback + /// then recovered the request. Combine the two to distinguish + /// "fast path missed but request served" from "request failed + /// end-to-end". + pub forwarder_errors: u64, } impl StatsSnapshot { @@ -4452,8 +4522,24 @@ impl StatsSnapshot { ) } }; + // Forwarder segment is only emitted when the path filter has + // actually fired — keeps the line clean for the typical + // (non-AppsScript / no-pattern-hit) case. `err` is the + // fast-path miss count (regardless of whether relay-fallback + // recovered the request); `relay_failures` covers actual + // end-to-end request failures. + let fwd_seg = if self.forwarder_calls + self.forwarder_errors == 0 { + String::new() + } else { + format!( + " fwd={} ({}KB) err={}", + self.forwarder_calls, + self.forwarder_bytes / 1024, + self.forwarder_errors, + ) + }; format!( - "stats: relay={} ({}KB) failures={} coalesced={} cache={}/{} ({:.0}% hit, {}KB) scripts={}/{} active{}", + "stats: relay={} ({}KB) failures={} coalesced={} cache={}/{} ({:.0}% hit, {}KB) scripts={}/{} active{}{}", self.relay_calls, self.bytes_relayed / 1024, self.relay_failures, @@ -4465,6 +4551,7 @@ impl StatsSnapshot { self.total_scripts - self.blacklisted_scripts, self.total_scripts, h2_seg, + fwd_seg, ) } @@ -4477,7 +4564,7 @@ impl StatsSnapshot { s.replace('\\', "\\\\").replace('"', "\\\"") } format!( - r#"{{"relay_calls":{},"relay_failures":{},"coalesced":{},"bytes_relayed":{},"cache_hits":{},"cache_misses":{},"cache_bytes":{},"blacklisted_scripts":{},"total_scripts":{},"today_calls":{},"today_bytes":{},"today_key":"{}","today_reset_secs":{},"h2_calls":{},"h2_fallbacks":{},"h2_disabled":{}}}"#, + r#"{{"relay_calls":{},"relay_failures":{},"coalesced":{},"bytes_relayed":{},"cache_hits":{},"cache_misses":{},"cache_bytes":{},"blacklisted_scripts":{},"total_scripts":{},"today_calls":{},"today_bytes":{},"today_key":"{}","today_reset_secs":{},"h2_calls":{},"h2_fallbacks":{},"h2_disabled":{},"forwarder_calls":{},"forwarder_bytes":{},"forwarder_errors":{}}}"#, self.relay_calls, self.relay_failures, self.coalesced, @@ -4494,6 +4581,9 @@ impl StatsSnapshot { self.h2_calls, self.h2_fallbacks, self.h2_disabled, + self.forwarder_calls, + self.forwarder_bytes, + self.forwarder_errors, ) } } @@ -6348,4 +6438,138 @@ hello"; expected.push(0x80); assert_eq!(strip_sabr_quality_tracks(&body), expected); } + + // ── StatsSnapshot::fmt_line + to_json (forwarder fields) ──────────── + + /// Build a `StatsSnapshot` fixture for serialization tests. + /// All non-forwarder fields take fixed sentinels; the three + /// forwarder fields are caller-supplied so each test can target + /// the zero / non-zero branches. + fn snapshot_with_forwarder( + forwarder_calls: u64, + forwarder_bytes: u64, + forwarder_errors: u64, + ) -> StatsSnapshot { + StatsSnapshot { + relay_calls: 100, + relay_failures: 2, + coalesced: 5, + bytes_relayed: 4096, + cache_hits: 30, + cache_misses: 70, + cache_bytes: 8192, + blacklisted_scripts: 0, + total_scripts: 1, + today_calls: 100, + today_bytes: 4096, + today_key: "2026-05-10".into(), + today_reset_secs: 3600, + h2_calls: 80, + h2_fallbacks: 4, + h2_disabled: false, + forwarder_calls, + forwarder_bytes, + forwarder_errors, + } + } + + #[test] + fn fmt_line_omits_forwarder_segment_when_zero() { + // Path filter never fired → forwarder values all zero. The + // CLI line must NOT carry an `fwd=0 err=0` segment, otherwise + // every non-AppsScript / no-pattern-hit user sees a confusing + // always-zero pair. + let s = snapshot_with_forwarder(0, 0, 0); + let line = s.fmt_line(); + assert!( + !line.contains("fwd="), + "fmt_line must omit forwarder segment when all-zero: {}", + line + ); + assert!( + !line.contains("err="), + "fmt_line must omit forwarder error count when all-zero: {}", + line + ); + } + + #[test] + fn fmt_line_includes_forwarder_segment_when_nonzero() { + // Once the path filter has fired (any of calls / errors > 0), + // the diagnostic segment shows up. Bytes are converted to KB + // (mirrors `bytes_relayed`). + let s = snapshot_with_forwarder(42, 1_048_576, 3); + let line = s.fmt_line(); + assert!(line.contains("fwd=42"), "fmt_line missing fwd=42: {}", line); + // 1_048_576 / 1024 = 1024 KB + assert!( + line.contains("(1024KB)"), + "fmt_line missing bytes segment: {}", + line + ); + assert!(line.contains("err=3"), "fmt_line missing err=3: {}", line); + } + + #[test] + fn fmt_line_includes_forwarder_segment_when_only_errors_nonzero() { + // Edge: forwarder consistently failed → calls=0, errors > 0. + // Segment must still appear so users see the failure rate; + // otherwise a fully-broken fast path looks identical to one + // that's never been triggered. + let s = snapshot_with_forwarder(0, 0, 5); + let line = s.fmt_line(); + assert!(line.contains("fwd=0"), "fmt_line missing fwd=0: {}", line); + assert!(line.contains("err=5"), "fmt_line missing err=5: {}", line); + } + + #[test] + fn to_json_emits_forwarder_fields_with_zero_values() { + // Hand-rolled to_json must include the new fields even when + // zero — Android JNI consumers expect a stable schema and + // shouldn't have to handle "missing field" branches per + // version. Parse the output as JSON to also validate the + // hand-rolled format string is syntactically correct. + let s = snapshot_with_forwarder(0, 0, 0); + let json = s.to_json(); + let parsed: serde_json::Value = + serde_json::from_str(&json).expect("to_json must produce valid JSON"); + assert_eq!(parsed["forwarder_calls"], 0); + assert_eq!(parsed["forwarder_bytes"], 0); + assert_eq!(parsed["forwarder_errors"], 0); + // No `forwarder_fallbacks` key — that was the pre-rename name + // and shipping it would confuse JNI consumers parsing both. + assert!( + parsed.get("forwarder_fallbacks").is_none(), + "stale field name must not appear: {}", + json + ); + } + + #[test] + fn to_json_emits_forwarder_fields_with_nonzero_values() { + let s = snapshot_with_forwarder(42, 1_048_576, 3); + let json = s.to_json(); + let parsed: serde_json::Value = + serde_json::from_str(&json).expect("to_json must produce valid JSON"); + assert_eq!(parsed["forwarder_calls"], 42); + assert_eq!(parsed["forwarder_bytes"], 1_048_576); + assert_eq!(parsed["forwarder_errors"], 3); + } + + #[test] + fn to_json_round_trips_existing_fields_alongside_new_ones() { + // Regression guard for the hand-rolled format string: adding + // the new forwarder fields must not have broken any of the + // preexisting fields. Pick a sample of each to confirm. + let s = snapshot_with_forwarder(7, 1024, 0); + let json = s.to_json(); + let parsed: serde_json::Value = + serde_json::from_str(&json).expect("to_json must produce valid JSON"); + assert_eq!(parsed["relay_calls"], 100); + assert_eq!(parsed["bytes_relayed"], 4096); + assert_eq!(parsed["h2_calls"], 80); + assert_eq!(parsed["h2_disabled"], false); + assert_eq!(parsed["today_key"], "2026-05-10"); + assert_eq!(parsed["forwarder_calls"], 7); + } } diff --git a/src/proxy_server.rs b/src/proxy_server.rs index 81568522..bd8a1a53 100644 --- a/src/proxy_server.rs +++ b/src/proxy_server.rs @@ -3189,7 +3189,13 @@ where && host_in_force_mitm_list(host, &rewrite_ctx.force_mitm_hosts) && !url_matches_relay_pattern(&url, &rewrite_ctx.relay_url_patterns) { - tracing::info!("sni-rewrite forward {} {}", method, url); + // All forwarder log lines use `target = "yt_forwarder"` so users + // diagnosing #977-style reports can `RUST_LOG=yt_forwarder=info` + // (or =debug) and see exactly which requests took the fast path, + // their sizes, and their latencies — without grepping the + // general-relay info-spam. + tracing::info!(target: "yt_forwarder", "dispatch {} {}", method, url); + let t0 = std::time::Instant::now(); match forward_via_sni_rewrite_http( &method, host, @@ -3202,6 +3208,21 @@ where .await { Ok(response_bytes) => { + let response_len = response_bytes.len(); + let elapsed_ms = t0.elapsed().as_millis(); + tracing::info!( + target: "yt_forwarder", + "ok {} {} bytes={} latency_ms={}", + method, url, response_len, elapsed_ms, + ); + // Record BEFORE the downstream write: we want + // `forwarder_calls` to reflect "the path filter + // produced an upstream response," not "the browser + // received it." A client disconnect during write would + // otherwise leave the metric understating fast-path + // utilisation — we'd see only relay-path traffic in + // stats while the forwarder was actually doing work. + fronter.record_forwarder_call(response_len as u64); stream.write_all(&response_bytes).await?; stream.flush().await?; // The forwarder always sets `Connection: close` on the @@ -3213,10 +3234,18 @@ where } Err(e) => { tracing::warn!( - "sni-rewrite forward failed for {}: {} — falling back to relay", - url, - e + target: "yt_forwarder", + "error {} {}: {} (latency_ms={}) — falling back to relay", + method, url, e, t0.elapsed().as_millis(), ); + // `record_forwarder_error` only describes what just + // happened to the fast path. Whether the relay-path + // fallback below recovers the request is reflected in + // `relay_calls` / `relay_failures`; combining those + // with `forwarder_errors` lets diagnostics tell apart + // "fast path missed but request served" from "request + // failed end-to-end." + fronter.record_forwarder_error(); // fall through } } From e91b12367dae4e562a53b8cd405bd4c12e718308 Mon Sep 17 00:00:00 2001 From: dazzling-no-more <278675588+dazzling-no-more@users.noreply.github.com> Date: Sun, 10 May 2026 18:45:12 +0400 Subject: [PATCH 4/5] fix(youtube): SABR keeps first field-3 (single-track buffer fix) --- src/config.rs | 44 +++++---- src/domain_fronter.rs | 216 ++++++++++++++++++++++++++++-------------- 2 files changed, 168 insertions(+), 92 deletions(-) diff --git a/src/config.rs b/src/config.rs index c0bd5480..325123a3 100644 --- a/src/config.rs +++ b/src/config.rs @@ -210,29 +210,31 @@ pub struct Config { #[serde(default)] pub relay_url_patterns: Vec, - /// Strip SABR quality-track entries (top-level field-3 of the - /// segment-fetch protobuf) from `/videoplayback` POST bodies on - /// `*.googlevideo.com` / `*.youtube.com`. Default `true` — this is - /// the upstream-parity behaviour and the fix for "Response too - /// large" 502s on multi-track segment fetches that exceed Apps - /// Script `UrlFetchApp`'s ~10 MB cap (commits 9b6d03e + 33db28a - /// from upstream Python). + /// Strip surplus SABR quality-track entries (top-level field-3 of + /// the segment-fetch protobuf) from `/videoplayback` POST bodies on + /// `*.googlevideo.com` / `*.youtube.com`. Default `true` — fixes + /// "Response too large" 502s on multi-track segment fetches that + /// exceed Apps Script `UrlFetchApp`'s ~10 MB cap (commits 9b6d03e + /// + 33db28a from upstream Python). /// - /// **Why this kill-switch exists** (#977 testing report from - /// `unacoder`, May 2026): under `apps_script` mode with - /// `youtube_via_relay = false`, forcing single-quality-track - /// responses can interact poorly with playback at faster-than-1× - /// speeds (1.7×–2×). Each chunk represents less buffer-ahead - /// duration than a multi-track bundle would, and at speed-up the - /// player drains the buffer faster than the next chunk arrives — - /// reported as "only one videoplayback request is buffered." + /// **Heuristic** (diverges from upstream's "strip all field-3"): + /// the first field-3 entry is always kept; only the 2nd and + /// subsequent ones are stripped, and only when at least one + /// field-2 byte-range entry is present (segment-fetch shape, not + /// session-init). Single-track requests pass through unchanged so + /// googlevideo always has a track selected. This was added in + /// response to #977 testing (unacoder, May 2026): the original + /// strip-all rule turned single-track requests into "zero tracks + /// selected" requests, which googlevideo answered with empty + /// bodies — buffer never advanced, player retried with `rn=` + /// incrementing. /// - /// Flip to `false` if you hit that regression. The trade-off is - /// that long-form videos may then 502 on segments where Google - /// would have bundled multiple quality tracks; the player handles - /// that by falling back to a lower quality. The pre-port behaviour - /// (no SABR strip) was tolerable for most users — the strip is a - /// quality-of-life fix for the specific bundling-blowup case. + /// **When to flip to `false`**: if you still see buffering issues + /// on long-form video playback after the keep-first refinement, + /// turn the strip off entirely to revert to the pre-port behaviour + /// (occasional 502s on multi-track segments → player falls back + /// to a lower quality). The pre-port behaviour was tolerable for + /// most users; the strip is an opt-in quality-of-life fix. #[serde(default = "default_sabr_strip")] pub sabr_strip: bool, diff --git a/src/domain_fronter.rs b/src/domain_fronter.rs index 2c1b7700..18bfb61a 100644 --- a/src/domain_fronter.rs +++ b/src/domain_fronter.rs @@ -3627,40 +3627,58 @@ fn url_host_is_youtube_video_endpoint(url: &str) -> bool { .any(|s| h == *s || h.ends_with(&format!(".{}", s))) } -/// Strip top-level field-3 (quality-track selection) entries from a SABR -/// segment-fetch protobuf body. +/// Strip surplus field-3 (quality-track selection) entries from a SABR +/// segment-fetch protobuf body, keeping the first one intact. /// /// YouTube's SABR (Server-Adaptive Bitrate) `videoplayback` POST bodies /// come in two distinct message shapes: /// /// * **Segment-fetch** — carries field-2 (tag `0x12`) byte-range entries /// for video/audio segments. Field-3 (tag `0x1a`) entries here are -/// quality-track selectors that ask googlevideo to bundle multiple -/// simultaneous quality tracks into one response, easily exceeding -/// Apps Script `UrlFetchApp`'s ~10 MB response buffer → 502. Stripping -/// them forces a single-track response that fits. +/// quality-track selectors that ask googlevideo to return a particular +/// quality track. When the player asks for *multiple* tracks at once +/// (multi-track bundling), googlevideo concatenates them into a single +/// response — easily exceeding Apps Script `UrlFetchApp`'s ~10 MB cap +/// → 502. /// /// * **Session-init** — carries field-5 (tag `0x2a`) entries and *no* /// field-2 entries. Field-3 here is essential session metadata /// (language, viewer state, ...). Stripping it corrupts the init /// handshake → CDN returns 403. /// -/// Heuristic: only strip field-3 when at least one field-2 entry is also -/// present at the top level (segment-fetch shape). Otherwise return the -/// body unchanged. +/// **Heuristic**: +/// +/// 1. Body must be segment-fetch shape (≥ 1 field-2 entry). Otherwise +/// no-op — session-init bodies must not be touched. +/// 2. Body must carry **2 or more** field-3 entries before stripping +/// fires. The first field-3 is always kept; only the 2nd, 3rd, ... +/// are removed. +/// +/// **Why keep the first field-3** (#977, unacoder testing, May 2026): +/// the original "strip all field-3" rule produced empty googlevideo +/// responses on single-track requests — the player asks for ONE track +/// via a sole field-3, we strip it, and the CDN answers a request with +/// zero tracks selected by sending nothing. The player retries +/// indefinitely with the `rn=` retry counter incrementing, never +/// advancing its buffer. Keeping the first field-3 means single-track +/// requests pass through unchanged (no regression at low quality) +/// while multi-track requests still get capped to one track (the +/// 10 MB-blowup fix is preserved). /// /// Only top-level fields are inspected; nested messages are left intact. /// On a malformed body (truncated tag, unknown wire type) the unparsed /// tail is appended verbatim so a corrupt request is never silently -/// truncated. Ported from upstream `_strip_sabr_quality_tracks` -/// (commits 9b6d03e + 33db28a). +/// truncated. Originally ported from upstream +/// `_strip_sabr_quality_tracks` (commits 9b6d03e + 33db28a); the +/// keep-first refinement diverges from upstream based on local testing. pub(crate) fn strip_sabr_quality_tracks(body: &[u8]) -> Vec { // Phase 1: single pass — collect (field_number, start, end) for every - // top-level field. We need to know whether field 2 exists before deciding - // to strip, but a two-pass walk would be wasteful. + // top-level field. We need both the segment-fetch detection (field-2 + // present) AND the field-3 count (≥ 2 to fire) before deciding, + // and a two-pass walk would be wasteful. let mut segments: Vec<(u32, usize, usize)> = Vec::new(); let mut has_field2 = false; - let mut has_field3 = false; + let mut field3_count: usize = 0; let mut i = 0usize; let n = body.len(); let mut tail_start = n; @@ -3778,20 +3796,32 @@ pub(crate) fn strip_sabr_quality_tracks(body: &[u8]) -> Vec { if field_number == 2 { has_field2 = true; } else if field_number == 3 { - has_field3 = true; + field3_count += 1; } segments.push((field_number, seg_start, i)); } - // Phase 2: only strip when this is a segment-fetch body (has field 2) - // AND there's at least one field-3 entry to strip. - if !has_field2 || !has_field3 { + // Phase 2: only strip when this is a segment-fetch body (has field + // 2) AND there are at least 2 field-3 entries — i.e. real multi- + // track bundling. Single-track requests (one field-3) flow through + // unchanged so googlevideo still has a track selected. + if !has_field2 || field3_count < 2 { return body.to_vec(); } + // Keep the first field-3 entry, strip the rest. `field3_kept` + // flips to `true` after the first encounter so subsequent ones + // fall through the strip branch. let mut out = Vec::with_capacity(body.len()); + let mut field3_kept = false; for (field_number, seg_start, seg_end) in segments { - if field_number != 3 { + if field_number == 3 { + if !field3_kept { + field3_kept = true; + out.extend_from_slice(&body[seg_start..seg_end]); + } + // else: strip + } else { out.extend_from_slice(&body[seg_start..seg_end]); } } @@ -6091,10 +6121,32 @@ hello"; } #[test] - fn sabr_strip_segment_fetch_drops_field3() { - // Segment-fetch shape: has both field-2 (range descriptor) and - // field-3 (quality-track selector). Strip should remove only the - // field-3 entries, leaving everything else intact. + fn sabr_strip_keeps_sole_field3_unchanged() { + // The #977 regression case from unacoder's testing: a + // segment-fetch body with exactly ONE field-3 entry (the + // single-track request the player sends at low/medium quality). + // The original "strip all field-3" rule turned this into a + // request with zero tracks selected, which googlevideo answered + // with an empty body — buffer never advanced, player retried + // 11+ times with `rn=` incrementing. Keep-first heuristic + // returns the body unchanged so the player gets a valid + // single-track response. + let mut body: Vec = Vec::new(); + enc_length_delim(&mut body, 2, b"range-descriptor"); + enc_length_delim(&mut body, 3, b"sole-quality-track"); + enc_varint_field(&mut body, 4, 12345); + + // No transform: 1 field-3 < 2-entry threshold. + assert_eq!(strip_sabr_quality_tracks(&body), body); + } + + #[test] + fn sabr_strip_segment_fetch_keeps_first_field3_strips_rest() { + // Segment-fetch shape with TWO field-3 entries (multi-track + // bundling). The first field-3 is kept (preserves a single + // track on the wire so googlevideo has something to send); the + // second is stripped (caps the response under 10 MB). Other + // fields pass through unchanged. let mut body: Vec = Vec::new(); enc_length_delim(&mut body, 2, b"range-descriptor-1"); enc_length_delim(&mut body, 3, b"quality-track-selector-1"); @@ -6104,7 +6156,9 @@ hello"; let mut expected: Vec = Vec::new(); enc_length_delim(&mut expected, 2, b"range-descriptor-1"); + enc_length_delim(&mut expected, 3, b"quality-track-selector-1"); // KEPT enc_length_delim(&mut expected, 2, b"range-descriptor-2"); + // quality-track-selector-2: STRIPPED enc_varint_field(&mut expected, 4, 12345); assert_eq!(strip_sabr_quality_tracks(&body), expected); @@ -6159,21 +6213,20 @@ hello"; } #[test] - fn sabr_strip_truncated_tag_after_field3_preserves_tail() { - // field-2, field-3, then truncated. Strip should remove the - // field-3 entry (segment-fetch shape) and copy the truncated - // tail verbatim. + fn sabr_strip_truncated_tag_after_single_field3_is_noop() { + // field-2 + ONE field-3 (single-track request) + truncated tag. + // Under the keep-first heuristic, single field-3 is preserved + // → strip is a no-op → body returned verbatim (truncated tail + // included). The original behaviour was to strip the sole + // field-3 here, but that's exactly the regression #977 + // identified. let mut body: Vec = Vec::new(); enc_length_delim(&mut body, 2, b"range-desc"); enc_length_delim(&mut body, 3, b"quality-track"); - // truncated tag - body.push(0x80); - - let mut expected: Vec = Vec::new(); - enc_length_delim(&mut expected, 2, b"range-desc"); - expected.push(0x80); + body.push(0x80); // truncated tag - assert_eq!(strip_sabr_quality_tracks(&body), expected); + // No transform — single field-3 < 2-entry threshold. + assert_eq!(strip_sabr_quality_tracks(&body), body); } #[test] @@ -6236,23 +6289,36 @@ hello"; } #[test] - fn sabr_strip_truncated_fixed_width_preserves_segment_verbatim() { - // 64-bit fixed (wire type 1) — only 3 of 8 bytes present. - // Without a length-check this used to clamp via .min(n) and - // declare the field "complete." Now bails at the segment. + fn sabr_strip_truncated_fixed_width_with_single_field3_is_noop() { + // 64-bit fixed (wire type 1) — only 3 of 8 bytes present. The + // bail-on-truncated-payload behaviour is unchanged; what's + // different from the older "strip all field-3" version is + // that a SOLE field-3 is now kept (keep-first heuristic), + // so the body comes back unchanged. let mut body: Vec = Vec::new(); enc_length_delim(&mut body, 2, b"r"); - enc_length_delim(&mut body, 3, b"q"); - // Tag = field 4, wire 1 = 0x21, then only 3 bytes follow. + enc_length_delim(&mut body, 3, b"q"); // sole field-3 → kept + body.push(0x21); + body.extend_from_slice(b"\x01\x02\x03"); + + assert_eq!(strip_sabr_quality_tracks(&body), body); + } + + #[test] + fn sabr_strip_truncated_fixed_width_with_two_field3_strips_extras() { + // Same fixed-width-truncation shape, but with TWO field-3 + // entries. Keep-first rule fires: first field-3 kept, second + // stripped, malformed tail verbatim. + let mut body: Vec = Vec::new(); + enc_length_delim(&mut body, 2, b"r"); + enc_length_delim(&mut body, 3, b"q1"); + enc_length_delim(&mut body, 3, b"q2"); // stripped body.push(0x21); body.extend_from_slice(b"\x01\x02\x03"); - // The malformed tail starts at the truncated fixed-width tag, - // so the field-2 / field-3 we did parse get emitted (segment- - // fetch shape, field-3 stripped), then the tail verbatim. let mut expected: Vec = Vec::new(); enc_length_delim(&mut expected, 2, b"r"); - // field-3 stripped here + enc_length_delim(&mut expected, 3, b"q1"); // kept expected.push(0x21); expected.extend_from_slice(b"\x01\x02\x03"); assert_eq!(strip_sabr_quality_tracks(&body), expected); @@ -6262,25 +6328,30 @@ hello"; // ── SABR kill-switch runtime gate (#977) ───────────────────────────── - /// Build a known segment-fetch body (field-2 + field-3) that the - /// strip would actually shrink — used to prove the gate at runtime - /// rather than just the config-default round-trip. + /// Build a known segment-fetch body that the strip would actually + /// shrink — multi-track shape (field-2 + 2× field-3) so the + /// keep-first heuristic fires and removes the second field-3. + /// Used to prove the gate at runtime rather than just the + /// config-default round-trip. fn segment_fetch_body() -> Vec { let mut body: Vec = Vec::new(); enc_length_delim(&mut body, 2, b"range-descriptor"); - enc_length_delim(&mut body, 3, b"quality-track-selector"); + enc_length_delim(&mut body, 3, b"quality-track-selector-1"); + enc_length_delim(&mut body, 3, b"quality-track-selector-2"); body } #[test] - fn sabr_strip_on_strips_segment_fetch_body_via_relay_gate() { - // sabr_strip = true (default): segment-fetch POST to a real - // googlevideo URL is stripped. This protects the main behaviour - // the kill-switch gates — if a future refactor accidentally - // drops the `self.sabr_strip` check from `relay()`, the strip - // would still apply on `true` and the test would pass; if the - // refactor accidentally INVERTS the check, this test fails - // because no bytes are removed. + fn sabr_strip_on_strips_extra_field3_entries_via_relay_gate() { + // sabr_strip = true (default), multi-track segment-fetch body + // (the keep-first heuristic threshold). The first field-3 + // entry must survive (so the player still has a track selected + // — the #977 lesson); subsequent field-3 entries must be gone + // (the 10 MB-blowup fix). Protects the main behaviour the + // kill-switch gates: if a future refactor drops the + // `self.sabr_strip` check, the strip still applies on `true` + // and the test passes; if the refactor inverts the check, this + // fails because no bytes are removed. let fronter = fronter_for_test_with(false, true); let body = segment_fetch_body(); let result = fronter.maybe_strip_sabr_body( @@ -6288,18 +6359,25 @@ hello"; "https://rrx---sn-xxx.googlevideo.com/videoplayback?id=42", &body, ); - let stripped = result.expect("sabr_strip=true must strip a segment-fetch body"); + let stripped = result.expect("sabr_strip=true must strip a multi-track body"); assert!( stripped.len() < body.len(), "strip must remove at least one byte ({} -> {})", body.len(), stripped.len(), ); - // And the field-3 entry specifically is what was removed. + // First field-3 kept (single-track preservation), second stripped. + assert!( + stripped + .windows(b"quality-track-selector-1".len()) + .any(|w| w == b"quality-track-selector-1"), + "first field-3 payload (quality-track-selector-1) must SURVIVE the strip", + ); assert!( - !stripped.windows(b"quality-track-selector".len()) - .any(|w| w == b"quality-track-selector"), - "field-3 payload must be gone from stripped body", + !stripped + .windows(b"quality-track-selector-2".len()) + .any(|w| w == b"quality-track-selector-2"), + "subsequent field-3 payload (quality-track-selector-2) must be STRIPPED", ); } @@ -6422,21 +6500,17 @@ hello"; } #[test] - fn sabr_strip_truncated_varint_payload_preserves_segment_verbatim() { - // Wire type 0 (varint) with a continuation byte and no terminator. + fn sabr_strip_truncated_varint_payload_with_single_field3_is_noop() { + // Wire type 0 (varint) with a continuation byte and no + // terminator. Sole field-3 → kept under keep-first heuristic + // → body returned verbatim (truncated tail included). let mut body: Vec = Vec::new(); enc_length_delim(&mut body, 2, b"r"); - enc_length_delim(&mut body, 3, b"q"); - // Tag = field 5, wire 0 = 0x28, then a continuation byte that - // never terminates. + enc_length_delim(&mut body, 3, b"q"); // sole field-3 → kept body.push(0x28); body.push(0x80); // continuation, then EOF - let mut expected: Vec = Vec::new(); - enc_length_delim(&mut expected, 2, b"r"); - expected.push(0x28); - expected.push(0x80); - assert_eq!(strip_sabr_quality_tracks(&body), expected); + assert_eq!(strip_sabr_quality_tracks(&body), body); } // ── StatsSnapshot::fmt_line + to_json (forwarder fields) ──────────── From c105755b1b437338964be74ca953dcd7e9c57cdf Mon Sep 17 00:00:00 2001 From: dazzling-no-more <278675588+dazzling-no-more@users.noreply.github.com> Date: Sun, 10 May 2026 21:20:37 +0400 Subject: [PATCH 5/5] =?UTF-8?q?fix(youtube):=20default=20sabr=5Fstrip=20to?= =?UTF-8?q?=20false=20(#977=20=E2=80=94=20opt-in=20after=20testing)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java/com/therealaleph/mhrv/ConfigStore.kt | 19 ++-- src/bin/ui.rs | 9 +- src/config.rs | 103 ++++++++++++------ 3 files changed, 83 insertions(+), 48 deletions(-) diff --git a/android/app/src/main/java/com/therealaleph/mhrv/ConfigStore.kt b/android/app/src/main/java/com/therealaleph/mhrv/ConfigStore.kt index d42f6eb9..fa87eaa9 100644 --- a/android/app/src/main/java/com/therealaleph/mhrv/ConfigStore.kt +++ b/android/app/src/main/java/com/therealaleph/mhrv/ConfigStore.kt @@ -160,11 +160,12 @@ data class MhrvConfig( val youtubeViaRelay: Boolean = false, /** - * SABR quality-track strip kill-switch (Rust `sabr_strip`). - * Default true. See `src/config.rs` `sabr_strip` for the trade-off - * and when to flip — Android-side is just round-trip plumbing. + * SABR quality-track strip — opt-in (Rust `sabr_strip`, default + * false after #977 testing). See `src/config.rs` `sabr_strip` for + * the full reasoning and when to flip on. Android-side is just + * round-trip plumbing. */ - val sabrStrip: Boolean = true, + val sabrStrip: Boolean = false, /** * Path-pinned relay routing (Rust `relay_url_patterns`). @@ -256,10 +257,10 @@ data class MhrvConfig( put("tunnel_doh", tunnelDoh) put("block_doh", blockDoh) if (youtubeViaRelay) put("youtube_via_relay", true) - // sabr_strip default is true on the Rust side; emit only - // when the user has explicitly disabled it so unchanged - // configs stay clean. #977 kill-switch. - if (!sabrStrip) put("sabr_strip", false) + // sabr_strip default is false on the Rust side (opt-in + // after #977); emit only when the user has explicitly + // enabled it so unchanged configs stay clean. + if (sabrStrip) put("sabr_strip", true) // Trim/drop-empty/dedupe before serializing — same pattern // as bypass_doh_hosts. Skip the key entirely when the user // hasn't added any extras so we don't leak an empty array @@ -495,7 +496,7 @@ object ConfigStore { tunnelDoh = obj.optBoolean("tunnel_doh", true), blockDoh = obj.optBoolean("block_doh", true), youtubeViaRelay = obj.optBoolean("youtube_via_relay", false), - sabrStrip = obj.optBoolean("sabr_strip", true), + sabrStrip = obj.optBoolean("sabr_strip", false), bypassDohHosts = obj.optJSONArray("bypass_doh_hosts")?.let { arr -> buildList { for (i in 0 until arr.length()) add(arr.optString(i)) } }?.filter { it.isNotBlank() }.orEmpty(), diff --git a/src/bin/ui.rs b/src/bin/ui.rs index 85626a3c..759116c3 100644 --- a/src/bin/ui.rs +++ b/src/bin/ui.rs @@ -433,7 +433,7 @@ fn load_form() -> (FormState, Option) { normalize_x_graphql: false, youtube_via_relay: false, relay_url_patterns: Vec::new(), - sabr_strip: true, + sabr_strip: false, passthrough_hosts: Vec::new(), block_quic: true, disable_padding: false, @@ -684,9 +684,10 @@ struct ConfigWire<'a> { /// the proxy-applied default isn't echoed into config.json. #[serde(skip_serializing_if = "Vec::is_empty")] relay_url_patterns: &'a Vec, - /// See `config::Config::sabr_strip`. Default `true`; emitted only - /// when explicitly disabled so unchanged configs stay clean. - #[serde(skip_serializing_if = "is_true")] + /// See `config::Config::sabr_strip`. Default `false` (opt-in + /// after #977); emitted only when explicitly enabled so unchanged + /// configs stay clean. + #[serde(skip_serializing_if = "is_false")] sabr_strip: bool, #[serde(skip_serializing_if = "Vec::is_empty")] passthrough_hosts: &'a Vec, diff --git a/src/config.rs b/src/config.rs index 325123a3..45605cd1 100644 --- a/src/config.rs +++ b/src/config.rs @@ -212,29 +212,35 @@ pub struct Config { /// Strip surplus SABR quality-track entries (top-level field-3 of /// the segment-fetch protobuf) from `/videoplayback` POST bodies on - /// `*.googlevideo.com` / `*.youtube.com`. Default `true` — fixes - /// "Response too large" 502s on multi-track segment fetches that - /// exceed Apps Script `UrlFetchApp`'s ~10 MB cap (commits 9b6d03e - /// + 33db28a from upstream Python). + /// `*.googlevideo.com` / `*.youtube.com`. **Default `false` — + /// opt-in.** The original use case was fixing "Response too large" + /// 502s on multi-track segment fetches that exceed Apps Script + /// `UrlFetchApp`'s ~10 MB cap (commits 9b6d03e + 33db28a from + /// upstream Python). /// - /// **Heuristic** (diverges from upstream's "strip all field-3"): - /// the first field-3 entry is always kept; only the 2nd and - /// subsequent ones are stripped, and only when at least one - /// field-2 byte-range entry is present (segment-fetch shape, not - /// session-init). Single-track requests pass through unchanged so - /// googlevideo always has a track selected. This was added in - /// response to #977 testing (unacoder, May 2026): the original - /// strip-all rule turned single-track requests into "zero tracks - /// selected" requests, which googlevideo answered with empty - /// bodies — buffer never advanced, player retried with `rn=` - /// incrementing. + /// **Why off by default** (#977 testing, unacoder, May 2026): + /// stripping field-3 entries broke video playback in the field + /// across two iterations of the heuristic. The strip-all variant + /// produced empty googlevideo responses on single-track requests + /// (player retried indefinitely with `rn=` incrementing); the + /// keep-first refinement still broke playback even on single-track + /// 1080p60 default-config tests. The most plausible explanation is + /// that field-3 entries aren't homogeneous quality-track selectors + /// — they likely encode multiple request facets (audio/video + /// tracks, init-segment refs) and stripping any of them produces + /// responses the player can't splice into its buffer. Without a + /// captured `/videoplayback` request body and proto reflection we + /// can't design a correct heuristic, so default-off ships safe + /// behaviour for the unbroken-playback common case. /// - /// **When to flip to `false`**: if you still see buffering issues - /// on long-form video playback after the keep-first refinement, - /// turn the strip off entirely to revert to the pre-port behaviour - /// (occasional 502s on multi-track segments → player falls back - /// to a lower quality). The pre-port behaviour was tolerable for - /// most users; the strip is an opt-in quality-of-life fix. + /// **When to flip on**: if you specifically hit "Response too + /// large" 502s on long-form videos at high quality (1080p+ on + /// long playlists is the usual case). The opt-in behaviour uses + /// the keep-first heuristic — strictly less aggressive than + /// upstream's strip-all, so it's the safer of the two flavours. + /// Accept that some videos may still not play correctly with the + /// strip on; you're trading "occasional 502s" for "occasional + /// broken segments." Most users should leave this off. #[serde(default = "default_sabr_strip")] pub sabr_strip: bool, @@ -604,11 +610,16 @@ fn default_auto_blacklist_cooldown_secs() -> u64 { 120 } /// hard-coded `BATCH_TIMEOUT` and Apps Script's typical response cliff. fn default_request_timeout_secs() -> u64 { 30 } -/// Default for `sabr_strip`: `true`. Matches upstream-parity behaviour -/// and the headline "no more 10 MB UrlFetchApp 502s on multi-track -/// segment fetches" win. Users hitting the speed-up-playback buffering -/// regression (#977) can opt out with `sabr_strip: false`. -fn default_sabr_strip() -> bool { true } +/// Default for `sabr_strip`: `false`. Flipped from `true` after #977 +/// testing — both strip-all and keep-first variants of the heuristic +/// broke video playback in the field, including on single-track +/// default-config tests at 1080p60. Without proto reflection on a +/// captured `/videoplayback` body we can't design a correct heuristic, +/// so default-off ships the unbroken-playback common case. Users who +/// specifically hit "Response too large" 502s on long-form 1080p+ +/// videos can opt in with `sabr_strip: true` (uses the keep-first +/// flavour, less aggressive than upstream's strip-all). +fn default_sabr_strip() -> bool { false } fn default_google_ip() -> String { "216.239.38.120".into() @@ -1168,23 +1179,45 @@ mod tests { } #[test] - fn sabr_strip_defaults_to_true_when_omitted() { - // Existing configs from before the kill-switch landed don't - // have the field. `serde(default = "default_sabr_strip")` must - // resolve to true so the upstream-parity behaviour is what - // unchanged users get. + fn sabr_strip_defaults_to_false_when_omitted() { + // After the #977 flip: `serde(default = "default_sabr_strip")` + // must resolve to false so configs without the field get the + // safe (unbroken-playback) common case. Existing configs that + // never had the field continue to work — they just don't get + // the strip applied. let s = r#"{ "mode": "apps_script", "auth_key": "secret-test-secret-test", "script_id": "X" }"#; let cfg: Config = serde_json::from_str(s).unwrap(); - assert!(cfg.sabr_strip, "default must be true (strip enabled)"); + assert!( + !cfg.sabr_strip, + "default must be false (strip opt-in, default off)" + ); + } + + #[test] + fn sabr_strip_round_trips_explicit_true_for_opt_in() { + // Users specifically hitting "Response too large" 502s on + // long-form high-quality videos can opt in with sabr_strip: + // true. The keep-first heuristic kicks in only on multi-track + // bodies — single-track requests pass through untouched. + let s = r#"{ + "mode": "apps_script", + "auth_key": "secret-test-secret-test", + "script_id": "X", + "sabr_strip": true + }"#; + let cfg: Config = serde_json::from_str(s).unwrap(); + assert!(cfg.sabr_strip, "explicit true must round-trip for opt-in users"); } #[test] - fn sabr_strip_round_trips_explicit_false() { - // Users hitting the #977 buffering regression flip this to false. + fn sabr_strip_round_trips_explicit_false_explicitly() { + // Round-trip the explicit-false case too — `false` is the + // default but a user might write it out for clarity, and we + // shouldn't lose information either way. let s = r#"{ "mode": "apps_script", "auth_key": "secret-test-secret-test", @@ -1192,7 +1225,7 @@ mod tests { "sabr_strip": false }"#; let cfg: Config = serde_json::from_str(s).unwrap(); - assert!(!cfg.sabr_strip, "explicit false must round-trip"); + assert!(!cfg.sabr_strip); } #[test]