security hardening

This commit is contained in:
Gerhard Scheikl
2026-05-31 09:35:31 +02:00
parent d7d437a871
commit 01b4734477
31 changed files with 1234 additions and 238 deletions
+103 -12
View File
@@ -1,27 +1,118 @@
import pLimit from "p-limit";
import type { WebhookReservation } from "./dedupe.server";
/**
* Fire-and-forget runner for webhook side-effects.
* Background runner for webhook side-effects.
*
* Shopify expects a 200 response within ~5 seconds, otherwise it considers
* the delivery failed and retries it. Heavy automation work (PDF render,
* Shopify Files upload, SMTP send) routinely exceeded that budget, which
* caused duplicate invoice emails before we added the dedupe table.
*
* Returning the response immediately and letting the work finish in the
* background keeps Shopify happy. Combined with the dedupe table this is
* defence-in-depth: dedupe ensures *correctness* even if a retry sneaks
* through, while async processing makes retries unlikely in the first
* place.
* Returning the response immediately and finishing the work afterwards keeps
* Shopify happy. Two problems with a naive `void work()`:
*
* Errors are caught and logged \u2014 they cannot reach a dispatcher because
* the HTTP response is already gone.
* 1. DoS / resource exhaustion — an order burst would spawn unbounded
* concurrent PDF renders + SMTP sends. We cap concurrency with a small
* in-process queue (`p-limit`); excess tasks queue instead of piling up.
* 2. Data loss on restart — `void work()` is invisible to shutdown, so a
* container stop (SIGTERM) killed in-flight invoice work mid-send. We
* track in-flight tasks and drain them (bounded) on SIGTERM/SIGINT.
*
* Reserve/commit dedupe (see dedupe.server.ts) is integrated here: on success
* we `commit()` the reservation (permanently deduped); on failure we
* `release()` it so Shopify's retry re-runs the work instead of being dropped
* as a duplicate.
*/
/**
 * Read an integer setting from an environment variable.
 *
 * Unset, empty, non-numeric, zero and non-finite values (e.g. "Infinity")
 * fall back to `fallback`. Fractional values are truncated toward zero, and
 * the result is clamped to `[min, max]`.
 *
 * @param raw      Raw environment value (may be undefined).
 * @param fallback Value used when `raw` is unusable.
 * @param min      Inclusive lower bound.
 * @param max      Inclusive upper bound.
 * @returns An integer within `[min, max]`.
 */
function readIntEnv(
  raw: string | undefined,
  fallback: number,
  min: number,
  max: number = Number.MAX_SAFE_INTEGER,
): number {
  const parsed = Number(raw);
  const chosen = Number.isFinite(parsed) && parsed !== 0 ? Math.trunc(parsed) : fallback;
  return Math.min(max, Math.max(min, chosen));
}

// Largest delay Node's timers accept; larger values are coerced to 1 ms,
// which would make the shutdown drain give up almost immediately.
const MAX_TIMER_DELAY_MS = 2_147_483_647;

// Maximum number of webhook tasks running at once. Must be a finite integer:
// p-limit rejects fractional values and treats Infinity as "no cap".
const CONCURRENCY = readIntEnv(process.env.WEBHOOK_CONCURRENCY, 4, 1);

// Upper bound (ms) on how long a shutdown drain waits for in-flight tasks.
const DRAIN_TIMEOUT_MS = readIntEnv(
  process.env.WEBHOOK_DRAIN_TIMEOUT_MS,
  25_000,
  1000,
  MAX_TIMER_DELAY_MS,
);
// Shared limiter: at most CONCURRENCY webhook tasks run at once; further
// tasks wait in p-limit's internal queue.
const limit = pLimit(CONCURRENCY);
// Promise of every scheduled (running or queued) task. Entries are removed
// when the task settles; the shutdown drain awaits whatever is in here.
const inFlight = new Set<Promise<unknown>>();
// Set to true by drainWebhookQueue(); runWebhookInBackground() only uses it
// to log a warning when work is enqueued during shutdown.
let draining = false;
export function runWebhookInBackground(
description: string,
work: () => Promise<unknown>,
reservation?: WebhookReservation | null,
): void {
// `void` so we don't accidentally `await` the floating promise; the
// node event loop keeps the task alive until it settles.
void work().catch((err) => {
console.error(`background webhook task '${description}' failed:`, err);
if (draining) {
// The process is shutting down. We still enqueue so the drain awaits this
// task — the server has already stopped listening, so this is at most the
// tail end of the last accepted request.
console.warn(`[webhook-queue] enqueuing task during shutdown drain: ${description}`);
}
const task = limit(async () => {
try {
await work();
await reservation?.commit();
} catch (err) {
console.error(`background webhook task '${description}' failed:`, err);
// Drop the dedupe reservation so Shopify's retry re-runs the work.
try {
await reservation?.release();
} catch (releaseErr) {
console.error(
`background webhook task '${description}': failed to release dedupe reservation:`,
releaseErr,
);
}
}
});
inFlight.add(task);
void task.finally(() => inFlight.delete(task));
}
/**
 * Flag the queue as draining and wait for all tracked webhook tasks to
 * settle, bounded by `timeoutMs`, so a container stop lets invoice work
 * finish instead of killing it mid-send.
 *
 * Tasks that are enqueued while the drain is already running are awaited
 * too: the in-flight set is re-read after every round until it is empty or
 * the timeout fires. Calling this more than once is safe.
 *
 * @param timeoutMs Maximum time to wait, in milliseconds.
 * @returns Resolves when the set is empty or the timeout elapsed; never rejects
 *          because of a failed task (`Promise.allSettled` is used).
 */
export async function drainWebhookQueue(timeoutMs = DRAIN_TIMEOUT_MS): Promise<void> {
  draining = true;
  if (inFlight.size === 0) return;
  console.log(
    `[webhook-queue] draining ${inFlight.size} in-flight webhook task(s) (timeout ${timeoutMs}ms)...`,
  );
  let timedOut = false;
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeout = new Promise<void>((resolve) => {
    timer = setTimeout(() => {
      timedOut = true;
      resolve();
    }, timeoutMs);
    // The drain timer itself must not keep the process alive.
    if (typeof timer.unref === "function") timer.unref();
  });
  try {
    // A single snapshot would miss tasks added after the drain started and
    // then misreport them as a timeout, so keep going until nothing is left.
    while (inFlight.size > 0 && !timedOut) {
      await Promise.race([Promise.allSettled([...inFlight]), timeout]);
    }
  } finally {
    if (timer) clearTimeout(timer);
  }
  if (inFlight.size > 0) {
    console.warn(
      `[webhook-queue] drain timed out with ${inFlight.size} task(s) still running`,
    );
  } else {
    console.log("[webhook-queue] drain complete");
  }
}
// The custom server (server.js) only loads the bundled build and cannot
// import this module, so the drain function is published on `globalThis`
// for it to await before calling process.exit during graceful shutdown.
type DrainGlobal = typeof globalThis & {
  __linumiqWebhookDrain?: typeof drainWebhookQueue;
};
const drainHost = globalThis as DrainGlobal;
drainHost.__linumiqWebhookDrain = drainWebhookQueue;

// Fallback for runtimes that bypass server.js (e.g. `shopify app dev`): on
// the first SIGTERM / SIGINT start a best-effort drain. The drain is
// idempotent, so it is harmless if the custom server triggers it as well.
const shutdownSignals = ["SIGTERM", "SIGINT"] as const;
shutdownSignals.forEach((signal) => {
  process.once(signal, () => {
    void drainWebhookQueue();
  });
});
+63
View File
@@ -0,0 +1,63 @@
import db from "../../db.server";
/**
* Periodic TTL cleanup for the `ProcessedWebhook` idempotency table.
*
* The table grows by one row per Shopify webhook delivery and is never read
* after the retry window closes, so without pruning it grows unbounded —
* eventually a disk/space DoS. We only need rows for as long as Shopify might
* retry a delivery (hours), so a generous retention window of a few days is
* ample while keeping the table small.
*/
const HOUR_MS = 60 * 60 * 1000;
/** Rows older than this are pruned (7 days). */
const RETENTION_MS = 7 * 24 * HOUR_MS;
/** Delay between two scheduled sweeps (one hour). */
const INTERVAL_MS = HOUR_MS;

/**
 * The single Prisma call the cleanup needs, declared structurally so a small
 * stub can stand in for the real client.
 */
export interface CleanupDeps {
  db: {
    processedWebhook: {
      deleteMany: (args: {
        where: { receivedAt: { lt: Date } };
      }) => Promise<{ count: number }>;
    };
  };
}

/** Becomes true once the periodic sweep has been installed in this process. */
let scheduled = false;

/**
 * Delete every `ProcessedWebhook` row whose `receivedAt` is older than the
 * retention window and log how many were removed. Failures are logged and
 * swallowed; this function never rejects.
 */
async function runCleanup(deps: CleanupDeps): Promise<void> {
  try {
    const where = { receivedAt: { lt: new Date(Date.now() - RETENTION_MS) } };
    const result = await deps.db.processedWebhook.deleteMany({ where });
    const removed = result.count;
    if (removed > 0) {
      console.log(`webhook-cleanup: removed ${removed} ProcessedWebhook row(s) older than 7d`);
    }
  } catch (err) {
    // Housekeeping is best-effort: report the problem, do not propagate it.
    console.warn("webhook-cleanup: prune failed:", err);
  }
}
/**
 * Install the hourly `ProcessedWebhook` sweep once per process.
 *
 * The first call starts an unref'd interval and immediately runs one sweep;
 * every later call returns without doing anything, so it is cheap to call on
 * each webhook.
 *
 * @param deps Database surface used by the sweep; defaults to the real client.
 */
export function ensureWebhookCleanupScheduled(deps: CleanupDeps = { db }): void {
  if (!scheduled) {
    scheduled = true;
    const sweep = (): void => {
      void runCleanup(deps);
    };
    const handle = setInterval(sweep, INTERVAL_MS);
    // An unref'd interval cannot keep the event loop alive on shutdown.
    if (typeof handle.unref === "function") handle.unref();
    // Prune right away rather than waiting a full interval.
    sweep();
  }
}
+174 -37
View File
@@ -1,67 +1,204 @@
import db from "../../db.server";
import { ensureWebhookCleanupScheduled } from "./cleanup.server";
/**
* Minimal shape of the Prisma client surface we use — declared inline so
* the helper can be unit-tested with a tiny stub instead of pulling in a
* real database.
* How long a `status="processing"` reservation is considered "live" before we
* assume the worker that claimed it crashed mid-process. After this window a
* stale reservation may be reclaimed and the work retried.
*/
const STALE_LEASE_MS = 5 * 60_000; // five minutes, in milliseconds

/** The columns of a stored webhook row that the dedupe logic reads. */
interface ProcessedRow {
  /** Delivery id the row is keyed on. */
  webhookId: string;
  /** Lifecycle marker; this module writes "processing" and "done". */
  status: string;
  /** Timestamp compared against the lease window to detect stale rows. */
  receivedAt: Date;
}
/**
* Minimal shape of the Prisma client surface we use — declared inline so the
* helper can be unit-tested with a tiny stub instead of a real database.
*/
export interface DedupeDeps {
db: {
processedWebhook: {
create: (args: {
data: { webhookId: string; topic: string; shopDomain: string };
data: { webhookId: string; topic: string; shopDomain: string; status: string };
}) => Promise<unknown>;
findUnique: (args: { where: { webhookId: string } }) => Promise<ProcessedRow | null>;
update: (args: {
where: { webhookId: string };
data: { status?: string; receivedAt?: Date };
}) => Promise<unknown>;
delete: (args: { where: { webhookId: string } }) => Promise<unknown>;
};
};
}
/**
* Returns `true` when this Shopify webhook delivery has already been
* processed and the caller should short-circuit without doing the work.
* A claim on a single Shopify webhook delivery. Obtained from
* {@link reserveWebhook}. The caller MUST eventually `commit()` (work
* succeeded — the delivery is permanently deduped) or `release()` (work
* failed — drop the reservation so Shopify's retry re-runs the work).
*
* Shopify retries webhook deliveries when it doesn't receive a 200 within
* its (~5s) timeout window. Without dedupe this caused us to email an
* invoice twice for the same order: the first slow delivery completed its
* work but Shopify timed out and re-sent the webhook, which then ran the
* automation a second time.
*
* We key on the `X-Shopify-Webhook-Id` header — Shopify guarantees the same
* value for retries of the same delivery, but a new value for genuinely
* new events. The insert is the lock: a unique-constraint violation
* (Prisma error code `P2002`) means another delivery already claimed this
* id.
* `commit`/`release` are no-ops for reservations without a webhook id (unit
* tests / non-Shopify callers) and for the fail-open path.
*/
export async function isDuplicateWebhook(
export interface WebhookReservation {
  /** Delivery id this claim belongs to, or `null` when the request had none. */
  webhookId: string | null;
  /** Call after the work succeeded. */
  commit: () => Promise<void>;
  /** Call after the work failed. */
  release: () => Promise<void>;
}

/**
 * Build a reservation whose `commit` and `release` do nothing. Used when no
 * dedupe row backs the claim.
 */
function noopReservation(webhookId: string | null): WebhookReservation {
  const inert: WebhookReservation = {
    webhookId,
    commit: async (): Promise<void> => {
      // intentionally empty
    },
    release: async (): Promise<void> => {
      // intentionally empty
    },
  };
  return inert;
}
/**
 * Report whether `err` carries the code "P2002" (unique-constraint
 * violation). The check is duck-typed on a `code` property so tests can
 * throw plain objects without importing the `Prisma` namespace.
 */
function isP2002(err: unknown): boolean {
  if (err === null || err === undefined) {
    return false;
  }
  const { code } = err as { code?: string };
  return code === "P2002";
}
/**
 * Build the reservation handle for a webhook id that is backed by a row in
 * the dedupe table.
 *
 * - `commit()` marks the row `status="done"`. If the row no longer exists
 *   (the update fails with code "P2025", e.g. because it was deleted between
 *   reservation and commit), a `done` row is created instead so the
 *   successful delivery is still recorded and later retries are skipped.
 * - `release()` deletes the row so a retry can reserve the id again.
 *
 * Neither method rejects: database errors are logged as warnings.
 *
 * @param webhookId Delivery id the reservation is keyed on.
 * @param shop      Shop domain, stored on the fallback row and used in logs.
 * @param topic     Webhook topic, stored on the fallback row and used in logs.
 * @param deps      Database surface (stubbable in tests).
 */
function makeReservation(
  webhookId: string,
  shop: string,
  topic: string,
  deps: DedupeDeps,
): WebhookReservation {
  return {
    webhookId,
    commit: async () => {
      try {
        await deps.db.processedWebhook.update({
          where: { webhookId },
          data: { status: "done" },
        });
        return;
      } catch (err) {
        // P2025 = record to update not found. Anything else is a genuine
        // failure: the work already succeeded, so only warn.
        if ((err as { code?: string } | null)?.code !== "P2025") {
          console.warn(`dedupe: failed to commit webhook ${webhookId} (${topic}/${shop}):`, err);
          return;
        }
      }
      // The row is gone; record the completed delivery directly.
      try {
        await deps.db.processedWebhook.create({
          data: { webhookId, topic, shopDomain: shop, status: "done" },
        });
      } catch (err) {
        console.warn(`dedupe: failed to commit webhook ${webhookId} (${topic}/${shop}):`, err);
      }
    },
    release: async () => {
      try {
        await deps.db.processedWebhook.delete({ where: { webhookId } });
      } catch (err) {
        console.warn(`dedupe: failed to release webhook ${webhookId} (${topic}/${shop}):`, err);
      }
    },
  };
}
/**
* Reserve this Shopify webhook delivery for processing.
*
* Shopify retries a delivery (re-using the same `X-Shopify-Webhook-Id`) when
* it doesn't receive a 200 within its ~5s timeout. Naively recording the id as
* "processed" *before* doing the work meant that if the heavy background work
* later failed (SMTP/GraphQL/PDF error), Shopify's retry was dropped as a
* duplicate and the invoice was never sent.
*
* This uses a two-phase reserve/commit keyed on the webhook id, with the
* unique `webhookId` primary key as the concurrency lock:
*
* - RESERVE: insert a `status="processing"` row. A unique-constraint
* violation (`P2002`) means the id is already claimed; we then inspect the
* existing row:
* - `done` → genuine duplicate → return `null` (skip).
* - `processing`, fresh → another delivery is in flight → `null`.
* - `processing`, stale → previous worker crashed → reclaim & retry.
* - COMMIT (caller, on success) → flip the row to `status="done"`.
* - RELEASE (caller, on failure) → delete the row so a retry reprocesses.
*
* Returns a {@link WebhookReservation} when the caller should process the
* delivery, or `null` when it must short-circuit (duplicate / concurrent).
*
* Fail-open: a dedupe-table error (other than P2002) never silently drops a
* webhook — we return a no-op reservation and let the work run.
*/
export async function reserveWebhook(
request: Request,
shop: string,
topic: string,
deps: DedupeDeps = { db },
): Promise<boolean> {
): Promise<WebhookReservation | null> {
// Opportunistically schedule TTL cleanup (runtime-only; never in build/CLI
// since this is reached only while handling a live webhook request).
ensureWebhookCleanupScheduled();
const webhookId = request.headers.get("x-shopify-webhook-id");
if (!webhookId) {
// Defensive: in unit tests / non-Shopify callers there is no id.
// Don't dedupe — that would silently drop legitimate calls.
return false;
// No id (unit tests / non-Shopify callers): process without dedupe.
return noopReservation(null);
}
const reservation = makeReservation(webhookId, shop, topic, deps);
try {
await deps.db.processedWebhook.create({
data: { webhookId, topic, shopDomain: shop },
data: { webhookId, topic, shopDomain: shop, status: "processing" },
});
return false;
return reservation;
} catch (err) {
// Duck-typed P2002 check so callers can stub the db without pulling
// in the real `Prisma` namespace.
if ((err as { code?: string } | null)?.code === "P2002") {
console.log(
`dedupe: skipping duplicate ${topic} delivery for ${shop} (webhookId=${webhookId})`,
);
return true;
if (!isP2002(err)) {
// Don't fail (or silently drop) a webhook on a logging-table issue.
console.warn(`dedupe: failed to reserve webhook ${webhookId} (${topic}/${shop}):`, err);
return noopReservation(webhookId);
}
// Don't fail the webhook on a logging-table issue; just process it.
console.warn(
`dedupe: failed to record webhook ${webhookId} (${topic}/${shop}):`,
err,
);
return false;
}
// A row already exists. Classify it.
let existing: ProcessedRow | null = null;
try {
existing = await deps.db.processedWebhook.findUnique({ where: { webhookId } });
} catch (err) {
console.warn(`dedupe: failed to load existing webhook ${webhookId} (${topic}/${shop}):`, err);
// Another worker owns the row and we can't classify it — be safe and skip.
return null;
}
if (!existing) {
// Raced with a release/delete between create() and findUnique(); reclaim.
return reservation;
}
if (existing.status === "done") {
console.log(
`dedupe: skipping already-processed ${topic} for ${shop} (webhookId=${webhookId})`,
);
return null;
}
const age = Date.now() - new Date(existing.receivedAt).getTime();
if (age > STALE_LEASE_MS) {
// The worker that reserved this crashed mid-process (or left a stale row).
// Renew the lease and retry the work.
try {
await deps.db.processedWebhook.update({
where: { webhookId },
data: { status: "processing", receivedAt: new Date() },
});
} catch (err) {
console.warn(`dedupe: failed to reclaim stale webhook ${webhookId}:`, err);
return null;
}
console.log(
`dedupe: reclaiming stale ${topic} reservation for ${shop} ` +
`(webhookId=${webhookId}, age=${Math.round(age / 1000)}s)`,
);
return reservation;
}
// A fresh "processing" row: another delivery is actively working on it.
// Skip this concurrent delivery. Shopify will retry; if the active worker
// fails it releases the reservation so a later retry reprocesses.
console.log(
`dedupe: ${topic} for ${shop} already in-flight (webhookId=${webhookId}); ` +
`skipping concurrent delivery`,
);
return null;
}