diff --git a/app/admin/users/[id]/page.tsx b/app/admin/users/[id]/page.tsx index c380ccd..e282c6d 100644 --- a/app/admin/users/[id]/page.tsx +++ b/app/admin/users/[id]/page.tsx @@ -1,6 +1,7 @@ import { notFound } from 'next/navigation'; import Link from 'next/link'; import { getSupabaseAdmin } from '@/lib/supabase/admin'; +import { withAdminRetry } from '@/lib/admin/retry'; import { createSupabaseServerClient } from '@/lib/supabase/server'; import { isUuid } from '@/lib/admin/validators'; import { formatBytes, formatDate } from '@/lib/format'; @@ -41,8 +42,8 @@ export default async function AdminUserDetailPage({ data: { user: currentUser }, } = await supabase.auth.getUser(); - const { data: userRes, error } = await admin.auth.admin.getUserById( - params.id, + const { data: userRes, error } = await withAdminRetry(() => + admin.auth.admin.getUserById(params.id), ); if (error || !userRes.user) notFound(); const u = userRes.user; diff --git a/app/api/admin/users/[id]/route.ts b/app/api/admin/users/[id]/route.ts index 5f3637c..5a2ea58 100644 --- a/app/api/admin/users/[id]/route.ts +++ b/app/api/admin/users/[id]/route.ts @@ -1,6 +1,7 @@ import { type NextRequest } from 'next/server'; import { requireAdminApi } from '@/lib/auth/admin-guard'; import { getSupabaseAdmin } from '@/lib/supabase/admin'; +import { withAdminRetry } from '@/lib/admin/retry'; import { logAdminAction } from '@/lib/auth/audit'; import { isUuid } from '@/lib/admin/validators'; import { jsonNoStore } from '@/lib/admin/response'; @@ -33,8 +34,12 @@ export async function GET( const admin = getSupabaseAdmin(); - const { data: userRes, error: userErr } = - await admin.auth.admin.getUserById(id); + // Retry transient empty-body GoTrue responses so a burst-induced flake isn't + // misreported as a 404 for a user that actually exists. A genuine not-found + // (non-transient) still falls through to the clean 404 below. 
+ const { data: userRes, error: userErr } = await withAdminRetry(() => + admin.auth.admin.getUserById(id), + ); if (userErr || !userRes.user) { return jsonNoStore({ error: 'user not found' }, { status: 404 }); } @@ -103,9 +108,11 @@ export async function DELETE( // Confirm the user exists up front. GoTrue replies with an empty body when // deleting a non-existent user, which supabase-js surfaces as an opaque // JSON-parse error (no status / "not found" text); a positive existence - // check lets us return a clean 404 instead of a misleading 500. - const { data: existing, error: lookupErr } = - await admin.auth.admin.getUserById(id); + // check lets us return a clean 404 instead of a misleading 500. Retry the + // lookup so a transient empty body under load isn't mistaken for not-found. + const { data: existing, error: lookupErr } = await withAdminRetry(() => + admin.auth.admin.getUserById(id), + ); if (lookupErr || !existing.user) { return jsonNoStore({ error: 'user not found' }, { status: 404 }); } diff --git a/lib/admin/list.ts b/lib/admin/list.ts index 61bbbdd..3607ae6 100644 --- a/lib/admin/list.ts +++ b/lib/admin/list.ts @@ -1,5 +1,6 @@ import type { User } from '@supabase/supabase-js'; import { getSupabaseAdmin } from '@/lib/supabase/admin'; +import { withAdminRetry } from '@/lib/admin/retry'; import { parseOrder, parseSort, @@ -128,10 +129,14 @@ export async function getUsersList(opts: { // paginate the filtered+sorted set. const matched: User[] = []; for (let p = 1; p <= USER_SCAN_MAX_PAGES; p++) { - const { data, error } = await admin.auth.admin.listUsers({ - page: p, - perPage: USER_SCAN_PER_PAGE, - }); + // Retry transient empty-body GoTrue responses so a burst-induced flake + // doesn't abort the full directory scan mid-way. 
+ const { data, error } = await withAdminRetry(() => + admin.auth.admin.listUsers({ + page: p, + perPage: USER_SCAN_PER_PAGE, + }), + ); if (error) throw new Error(error.message); const us = data.users; if (us.length === 0) break; @@ -147,8 +152,12 @@ export async function getUsersList(opts: { const from = (page - 1) * perPage; pageUsers = matched.slice(from, from + perPage); } else { - // Common no-search, default-sort path: cheap single-page lookup. - const { data, error } = await admin.auth.admin.listUsers({ page, perPage }); + // Common no-search, default-sort path: cheap single-page lookup. Retry + // transient empty-body responses so the post-mutation auto-refresh that + // hits this path doesn't intermittently 500. + const { data, error } = await withAdminRetry(() => + admin.auth.admin.listUsers({ page, perPage }), + ); if (error) throw new Error(error.message); pageUsers = data.users; total = (data as unknown as { total?: number }).total ?? pageUsers.length; @@ -281,10 +290,16 @@ export async function getTunnelsList(opts: { } // Resolve owner emails (per-row getUserById; acceptable for current scale). + // The user_id comes from an existing tunnel row, so an empty body here is a + // transient burst flake rather than a genuine not-found — retry it. The + // try/catch null fallback remains as a last resort so one bad row can never + // 500 the whole list (it surfaces as "—" only if every retry still fails). const emails = await Promise.all( rows.map(async (t) => { try { - const { data: u } = await admin.auth.admin.getUserById(t.user_id); + const { data: u } = await withAdminRetry(() => + admin.auth.admin.getUserById(t.user_id), + ); return u.user?.email ?? null; } catch { return null; diff --git a/lib/admin/retry.ts b/lib/admin/retry.ts new file mode 100644 index 0000000..9c75f60 --- /dev/null +++ b/lib/admin/retry.ts @@ -0,0 +1,128 @@ +/** + * Retry helper for GoTrue admin reads (`auth.admin.listUsers` / + * `auth.admin.getUserById`). 
+ *
+ * Under a rapid burst of admin mutations (bulk ban/unban/delete) immediately
+ * followed by the auto list-refresh, GoTrue's admin endpoints intermittently
+ * return an EMPTY or TRUNCATED HTTP body. supabase-js then fails to parse the
+ * response and throws `Unexpected end of JSON input` (or returns an
+ * empty/transient `error`). The underlying request actually succeeded a moment
+ * later, so a small bounded retry turns these flaky failures into reliable
+ * reads without changing happy-path behaviour.
+ *
+ * IMPORTANT: only TRANSIENT failures are retried. Legitimate not-found (404) and
+ * validation (4xx) errors are returned immediately so genuine failures still
+ * surface as proper 4xx/5xx responses upstream.
+ */
+
+// Up to 3 attempts total (1 initial + 2 retries). Delays are applied BEFORE the
+// 2nd and 3rd attempts respectively, so worst-case added latency is ~350ms —
+// kept well under a second to keep the admin surface snappy.
+const MAX_ATTEMPTS = 3;
+const RETRY_DELAYS_MS = [100, 250];
+
+/** Loose shape that both `AuthError` and `PostgrestError` satisfy. */
+type MaybeError =
+  | { message?: string | null; status?: number | null; code?: string | number | null }
+  | null
+  | undefined;
+
+export type RetryOptions = {
+  /** Total attempts including the first. Defaults to {@link MAX_ATTEMPTS}. */
+  attempts?: number;
+  /** Backoff delays (ms) applied before each retry. Defaults to {@link RETRY_DELAYS_MS}. */
+  delaysMs?: number[];
+};
+
+function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+/** True when a thrown/parse error looks like an empty/truncated-body failure. */
+function isTransientMessage(message: string): boolean {
+  const m = message.toLowerCase();
+  return (
+    m.includes('unexpected end of json input') ||
+    m.includes('unexpected end of input') ||
+    m.includes('unexpected end of data') ||
+    // Some runtimes phrase the empty-body parse failure differently.
+    (m.includes('json') && m.includes('parse') && m.includes('unexpected')) ||
+    // Low-level network blips that also warrant a quick retry.
+    m.includes('fetch failed') ||
+    m.includes('network') ||
+    m.includes('econnreset') ||
+    m.includes('socket hang up') ||
+    m.includes('terminated')
+  );
+}
+
+/** True when a thrown value is a transient, retryable failure. */
+function isTransientThrow(err: unknown): boolean {
+  if (err instanceof Error) return isTransientMessage(err.message);
+  if (typeof err === 'string') return isTransientMessage(err);
+  return false;
+}
+
+/**
+ * True when a returned supabase-js `error` is transient. We retry on 5xx and on
+ * empty/parse-style messages, but NEVER on legitimate not-found/validation
+ * (e.g. a 404 with a real "User not found" message).
+ */
+function isTransientError(error: MaybeError): boolean {
+  if (!error) return false;
+  const status = typeof error.status === 'number' ? error.status : undefined;
+  // Explicit client/validation/not-found statuses are genuine — do not retry.
+  if (status !== undefined && status >= 400 && status < 500) return false;
+  if (status !== undefined && status >= 500) return true;
+  const message = typeof error.message === 'string' ? error.message : '';
+  if (message && isTransientMessage(message)) return true;
+  // An error object with neither a usable status nor message is treated as an
+  // opaque/empty transient failure and retried like any other transient one.
+  if (!status && !message) return true;
+  return false;
+}
+
+/**
+ * Await an async supabase-js admin call that returns `{ data, error }` (or that
+ * may throw), retrying only on transient empty-body / network failures.
+ *
+ * Returns the successful (or genuinely-failed, non-transient) result. After
+ * exhausting all attempts it returns the last `{ data, error }` result or
+ * re-throws the last thrown error, so persistent failures still surface.
+ */
+export async function withAdminRetry<R extends { error: MaybeError }>(
+  fn: () => Promise<R>,
+  opts?: RetryOptions,
+): Promise<R> {
+  // Always make at least one call: a zero, negative or NaN `attempts` would
+  // skip the loop below and hand callers an `undefined` result to destructure.
+  const attempts = Math.max(1, Math.floor(opts?.attempts ?? MAX_ATTEMPTS) || 1);
+  const delays = opts?.delaysMs ?? RETRY_DELAYS_MS;
+
+  let lastResult: R | undefined;
+  let lastThrown: unknown;
+  let threw = false;
+  for (let attempt = 1; attempt <= attempts; attempt++) {
+    try {
+      const result = await fn();
+      threw = false;
+      lastResult = result;
+      // Success, or a genuine (non-transient) error — return as-is.
+      if (!isTransientError(result.error)) return result;
+    } catch (err) {
+      // A non-transient throw is a real failure: surface it immediately.
+      if (!isTransientThrow(err)) throw err;
+      threw = true;
+      lastThrown = err;
+    }
+
+    if (attempt < attempts) {
+      const delay = delays[attempt - 1] ?? delays[delays.length - 1] ?? 0;
+      if (delay > 0) await sleep(delay);
+    }
+  }
+  // Exhausted all attempts on transient failures: surface the last outcome so
+  // the caller still sees a real error rather than a masked success.
+  if (threw) throw lastThrown;
+  return lastResult as R;
+}