security hardening
This commit is contained in:
@@ -1,27 +1,118 @@
|
||||
import pLimit from "p-limit";
|
||||
import type { WebhookReservation } from "./dedupe.server";
|
||||
|
||||
/**
|
||||
* Fire-and-forget runner for webhook side-effects.
|
||||
* Background runner for webhook side-effects.
|
||||
*
|
||||
* Shopify expects a 200 response within ~5 seconds, otherwise it considers
|
||||
* the delivery failed and retries it. Heavy automation work (PDF render,
|
||||
* Shopify Files upload, SMTP send) routinely exceeded that budget, which
|
||||
* caused duplicate invoice emails before we added the dedupe table.
|
||||
*
|
||||
* Returning the response immediately and letting the work finish in the
|
||||
* background keeps Shopify happy. Combined with the dedupe table this is
|
||||
* defence-in-depth: dedupe ensures *correctness* even if a retry sneaks
|
||||
* through, while async processing makes retries unlikely in the first
|
||||
* place.
|
||||
* Returning the response immediately and finishing the work afterwards keeps
|
||||
* Shopify happy. Two problems with a naive `void work()`:
|
||||
*
|
||||
* Errors are caught and logged — they cannot reach a dispatcher because
|
||||
* the HTTP response is already gone.
|
||||
* 1. DoS / resource exhaustion — an order burst would spawn unbounded
|
||||
* concurrent PDF renders + SMTP sends. We cap concurrency with a small
|
||||
* in-process queue (`p-limit`); excess tasks queue instead of piling up.
|
||||
* 2. Data loss on restart — `void work()` is invisible to shutdown, so a
|
||||
* container stop (SIGTERM) killed in-flight invoice work mid-send. We
|
||||
* track in-flight tasks and drain them (bounded) on SIGTERM/SIGINT.
|
||||
*
|
||||
* Reserve/commit dedupe (see dedupe.server.ts) is integrated here: on success
|
||||
* we `commit()` the reservation (permanently deduped); on failure we
|
||||
* `release()` it so Shopify's retry re-runs the work instead of being dropped
|
||||
* as a duplicate.
|
||||
*/
|
||||
|
||||
/**
 * Read a numeric tuning knob from the environment as a bounded integer.
 *
 * Unset, empty, non-numeric and zero values fall back to `fallback`. The
 * result is floored to an integer and clamped to `[min, max]`.
 *
 * @param name     Environment variable to read.
 * @param fallback Value used when the variable is missing or unusable.
 * @param min      Smallest value that may be returned.
 * @param max      Largest value that may be returned (unbounded by default).
 * @returns The bounded integer setting.
 */
function readBoundedIntEnv(
  name: string,
  fallback: number,
  min: number,
  max: number = Number.POSITIVE_INFINITY,
): number {
  // `|| fallback` covers NaN (unset / garbage) and 0 in one step.
  const parsed = Number(process.env[name]) || fallback;
  // Floor so a fractional setting such as "2.5" stays a whole number.
  return Math.min(max, Math.max(min, Math.floor(parsed)));
}

// Node coerces timer delays above this value to 1ms, which would turn a
// "very long" drain timeout into an immediate one.
const MAX_TIMER_DELAY_MS = 2_147_483_647;

// Maximum number of webhook tasks that run at the same time (at least 1).
const CONCURRENCY = readBoundedIntEnv("WEBHOOK_CONCURRENCY", 4, 1);

// Default upper bound for a shutdown drain, in milliseconds (at least 1s).
const DRAIN_TIMEOUT_MS = readBoundedIntEnv(
  "WEBHOOK_DRAIN_TIMEOUT_MS",
  25_000,
  1000,
  MAX_TIMER_DELAY_MS,
);
|
||||
|
||||
// Limiter shared by every webhook task: callers hand it an async function and
// it runs at most CONCURRENCY of them at a time.
const limit = pLimit(CONCURRENCY);
|
||||
// Tasks that have been handed to the limiter and have not settled yet. Each
// task removes itself from this set when it settles.
const inFlight = new Set<Promise<unknown>>();
|
||||
// Set to true by drainWebhookQueue(). Only read to log a warning when a task
// is enqueued after a drain has started.
let draining = false;
|
||||
|
||||
/**
 * Queue a webhook side-effect to run after the HTTP response has been sent.
 *
 * The work is handed to the shared concurrency limiter and tracked in
 * `inFlight` so a shutdown drain can await it. Errors thrown by `work` are
 * logged here; nothing is propagated to the caller.
 *
 * Dedupe reservation handling:
 * - `work` resolves → `reservation.commit()` is called.
 * - `work` rejects  → `reservation.release()` is called so a redelivery can
 *   re-run the work.
 * - `commit()` itself failing is only logged. The work has already succeeded,
 *   so the reservation is deliberately NOT released (releasing would invite a
 *   retry to repeat side-effects that already happened).
 *
 * @param description Human-readable label used in log messages.
 * @param work        The side-effect to run.
 * @param reservation Optional dedupe reservation to commit/release.
 */
export function runWebhookInBackground(
  description: string,
  work: () => Promise<unknown>,
  reservation?: WebhookReservation | null,
): void {
  if (draining) {
    // The process is shutting down. We still enqueue so the drain awaits this
    // task — the server has already stopped listening, so this is at most the
    // tail end of the last accepted request.
    console.warn(`[webhook-queue] enqueuing task during shutdown drain: ${description}`);
  }

  const task = limit(async () => {
    try {
      await work();
    } catch (err) {
      console.error(`background webhook task '${description}' failed:`, err);
      // Drop the dedupe reservation so Shopify's retry re-runs the work.
      try {
        await reservation?.release();
      } catch (releaseErr) {
        console.error(
          `background webhook task '${description}': failed to release dedupe reservation:`,
          releaseErr,
        );
      }
      return;
    }

    // Work succeeded. A commit failure must not be treated as a work failure.
    try {
      await reservation?.commit();
    } catch (commitErr) {
      console.error(
        `background webhook task '${description}': failed to commit dedupe reservation:`,
        commitErr,
      );
    }
  });

  inFlight.add(task);
  // The wrapper above never rejects, so this chain cannot produce an
  // unhandled rejection; `void` marks it as intentionally not awaited.
  void task.finally(() => inFlight.delete(task));
}
|
||||
|
||||
/**
 * Stop accepting new work (best-effort) and await in-flight + queued tasks,
 * bounded by `timeoutMs`, so a container stop drains invoice work instead of
 * killing it mid-send. Safe to call more than once.
 *
 * Tasks that are enqueued while the drain is already waiting are awaited as
 * well: the wait repeats until the in-flight set is empty or the deadline has
 * passed.
 *
 * @param timeoutMs Upper bound for the whole drain in milliseconds. Negative
 *   or NaN values are treated as 0; values above 2_147_483_647 are clamped,
 *   because Node coerces larger timer delays to 1ms.
 * @returns Resolves once the queue is empty or the timeout elapsed.
 */
export async function drainWebhookQueue(timeoutMs = DRAIN_TIMEOUT_MS): Promise<void> {
  draining = true;
  if (inFlight.size === 0) return;

  const maxTimerDelayMs = 2_147_483_647;
  const budgetMs = Number.isNaN(timeoutMs)
    ? 0
    : Math.min(Math.max(timeoutMs, 0), maxTimerDelayMs);

  console.log(
    `[webhook-queue] draining ${inFlight.size} in-flight webhook task(s) (timeout ${budgetMs}ms)...`,
  );

  let timedOut = false;
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeout = new Promise<void>((resolve) => {
    timer = setTimeout(() => {
      timedOut = true;
      resolve();
    }, budgetMs);
    // Don't let the deadline timer itself keep the event loop alive.
    if (typeof timer.unref === "function") timer.unref();
  });

  try {
    // Re-snapshot on every pass so tasks added mid-drain are awaited too.
    while (inFlight.size > 0 && !timedOut) {
      await Promise.race([Promise.allSettled([...inFlight]), timeout]);
    }
  } finally {
    if (timer) clearTimeout(timer);
  }

  if (inFlight.size > 0) {
    console.warn(
      `[webhook-queue] drain timed out with ${inFlight.size} task(s) still running`,
    );
  } else {
    console.log("[webhook-queue] drain complete");
  }
}
|
||||
|
||||
// Bridge for the custom server (server.js), which loads only the bundled
|
||||
// build and cannot import this module directly. It awaits this drain before
|
||||
// calling process.exit during graceful shutdown.
|
||||
/** `globalThis`, widened with the optional slot the drain hook is stored in. */
type DrainGlobal = typeof globalThis & {
  __linumiqWebhookDrain?: typeof drainWebhookQueue;
};

{
  // Publish the drain function on the global object.
  const host: DrainGlobal = globalThis;
  host.__linumiqWebhookDrain = drainWebhookQueue;
}

// Safety net for runtimes that don't go through server.js (e.g. `shopify app
// dev`): on the first SIGTERM / SIGINT, start a best-effort drain. The result
// is not awaited here; the custom server awaits the same drain before exiting.
(["SIGTERM", "SIGINT"] as const).forEach((signal) => {
  process.once(signal, () => {
    void drainWebhookQueue();
  });
});
|
||||
|
||||
@@ -0,0 +1,63 @@
|
||||
import db from "../../db.server";
|
||||
|
||||
/**
|
||||
* Periodic TTL cleanup for the `ProcessedWebhook` idempotency table.
|
||||
*
|
||||
* The table grows by one row per Shopify webhook delivery and is never read
|
||||
* after the retry window closes, so without pruning it grows unbounded —
|
||||
* eventually a disk/space DoS. We only need rows for as long as Shopify might
|
||||
* retry a delivery (hours), so a generous retention window of a few days is
|
||||
* ample while keeping the table small.
|
||||
*/
|
||||
// Rows whose `receivedAt` is older than this are deleted by each sweep.
const RETENTION_MS = 7 * 24 * 60 * 60 * 1000; // 7 days
|
||||
// Delay between two scheduled sweeps.
const INTERVAL_MS = 60 * 60 * 1000; // hourly
|
||||
|
||||
/**
 * The only part of the database client the cleanup job calls. Declared
 * structurally so a small stub can be passed in place of the real client.
 */
export interface CleanupDeps {
  db: {
    processedWebhook: {
      /**
       * Bulk delete of rows whose `receivedAt` is before the given date.
       * Resolves with the number of rows removed.
       */
      deleteMany: (args: {
        where: { receivedAt: { lt: Date } };
      }) => Promise<{ count: number }>;
    };
  };
}
|
||||
|
||||
// Guard read and set by ensureWebhookCleanupScheduled() so the interval is only created once.
let scheduled = false;
|
||||
|
||||
/**
 * Run one sweep: delete every `processedWebhook` row whose `receivedAt` is
 * older than the retention window, and log how many were removed.
 *
 * Failures are logged as warnings and swallowed; the returned promise does
 * not reject because of a failed delete.
 *
 * @param deps Database surface to delete through.
 */
async function runCleanup(deps: CleanupDeps): Promise<void> {
  const olderThan = new Date(Date.now() - RETENTION_MS);

  try {
    const outcome = await deps.db.processedWebhook.deleteMany({
      where: { receivedAt: { lt: olderThan } },
    });
    const removed = outcome.count;

    // Stay quiet when the sweep found nothing to delete.
    if (!(removed > 0)) return;

    console.log(`webhook-cleanup: removed ${removed} ProcessedWebhook row(s) older than 7d`);
  } catch (err) {
    // Best-effort housekeeping — never throw into the caller.
    console.warn("webhook-cleanup: prune failed:", err);
  }
}
|
||||
|
||||
/**
 * Start the periodic cleanup exactly once.
 *
 * The first call creates an interval that runs a sweep every `INTERVAL_MS`
 * and also runs one sweep straight away. Later calls return immediately, so
 * this is safe to call on every webhook.
 *
 * The interval is `unref`'d where the runtime supports it, so it does not by
 * itself keep the process alive.
 *
 * @param deps Database surface used by the sweeps; defaults to the real client.
 */
export function ensureWebhookCleanupScheduled(deps: CleanupDeps = { db }): void {
  if (scheduled) return;
  scheduled = true;

  // Each sweep is fire-and-forget; runCleanup handles its own errors.
  const sweep = (): void => {
    void runCleanup(deps);
  };

  const handle = setInterval(sweep, INTERVAL_MS);
  // Don't let the housekeeping interval keep the event loop alive on shutdown.
  if (typeof handle.unref === "function") handle.unref();

  // Kick off an immediate sweep so a long-lived process prunes promptly.
  sweep();
}
|
||||
@@ -1,67 +1,204 @@
|
||||
import db from "../../db.server";
|
||||
import { ensureWebhookCleanupScheduled } from "./cleanup.server";
|
||||
|
||||
/**
|
||||
* Minimal shape of the Prisma client surface we use — declared inline so
|
||||
* the helper can be unit-tested with a tiny stub instead of pulling in a
|
||||
* real database.
|
||||
* How long a `status="processing"` reservation is considered "live" before we
|
||||
* assume the worker that claimed it crashed mid-process. After this window a
|
||||
* stale reservation may be reclaimed and the work retried.
|
||||
*/
|
||||
// Compared against the age (now - receivedAt) of an existing "processing" row when a delivery collides with it.
const STALE_LEASE_MS = 5 * 60 * 1000; // 5 minutes
|
||||
|
||||
/** The fields of a `processedWebhook` row that the dedupe logic reads. */
interface ProcessedRow {
  /** Delivery id the row is keyed on. */
  webhookId: string;
  /** `"processing"` while a delivery is claimed, `"done"` once committed. */
  status: string;
  /**
   * Timestamp used to measure the age of a claim. It is rewritten when a
   * stale claim is renewed.
   */
  receivedAt: Date;
}
|
||||
|
||||
/**
 * Minimal shape of the Prisma client surface we use — declared inline so the
 * helper can be unit-tested with a tiny stub instead of a real database.
 */
export interface DedupeDeps {
  db: {
    processedWebhook: {
      /** Insert the claim row; expected to reject with code `P2002` when the id already exists. */
      create: (args: {
        data: { webhookId: string; topic: string; shopDomain: string; status: string };
      }) => Promise<unknown>;
      /** Load the existing row for an id, or `null` when there is none. */
      findUnique: (args: { where: { webhookId: string } }) => Promise<ProcessedRow | null>;
      /** Change the status and/or timestamp of an existing row. */
      update: (args: {
        where: { webhookId: string };
        data: { status?: string; receivedAt?: Date };
      }) => Promise<unknown>;
      /** Remove the row for an id. */
      delete: (args: { where: { webhookId: string } }) => Promise<unknown>;
    };
  };
}
|
||||
|
||||
/**
|
||||
* Returns `true` when this Shopify webhook delivery has already been
|
||||
* processed and the caller should short-circuit without doing the work.
|
||||
* A claim on a single Shopify webhook delivery. Obtained from
|
||||
* {@link reserveWebhook}. The caller MUST eventually `commit()` (work
|
||||
* succeeded — the delivery is permanently deduped) or `release()` (work
|
||||
* failed — drop the reservation so Shopify's retry re-runs the work).
|
||||
*
|
||||
* Shopify retries webhook deliveries when it doesn't receive a 200 within
|
||||
* its (~5s) timeout window. Without dedupe this caused us to email an
|
||||
* invoice twice for the same order: the first slow delivery completed its
|
||||
* work but Shopify timed out and re-sent the webhook, which then ran the
|
||||
* automation a second time.
|
||||
*
|
||||
* We key on the `X-Shopify-Webhook-Id` header — Shopify guarantees the same
|
||||
* value for retries of the same delivery, but a new value for genuinely
|
||||
* new events. The insert is the lock: a unique-constraint violation
|
||||
* (Prisma error code `P2002`) means another delivery already claimed this
|
||||
* id.
|
||||
* `commit`/`release` are no-ops for reservations without a webhook id (unit
|
||||
* tests / non-Shopify callers) and for the fail-open path.
|
||||
*/
|
||||
export async function isDuplicateWebhook(
|
||||
/**
 * Handle for one claimed webhook delivery. The holder is expected to call
 * exactly one of `commit()` or `release()` once the work has finished.
 */
export interface WebhookReservation {
  /** Delivery id this reservation is for, or `null` when the request carried none. */
  webhookId: string | null;
  /** Call after the work succeeded. */
  commit: () => Promise<void>;
  /** Call after the work failed. */
  release: () => Promise<void>;
}
|
||||
|
||||
/**
 * Build a reservation whose `commit` and `release` do nothing.
 *
 * @param webhookId Id to expose on the reservation (may be `null`).
 * @returns A reservation that never touches the database.
 */
function noopReservation(webhookId: string | null): WebhookReservation {
  const commit = async (): Promise<void> => {};
  const release = async (): Promise<void> => {};
  return { webhookId, commit, release };
}
|
||||
|
||||
/**
 * Tell whether a caught value carries the error code `"P2002"`
 * (unique-constraint violation).
 *
 * The check is duck-typed on a `code` property so that test stubs can throw
 * plain objects without pulling in the real `Prisma` namespace.
 *
 * @param err Any caught value.
 * @returns `true` only when `err.code === "P2002"`.
 */
function isP2002(err: unknown): boolean {
  // null / undefined have no properties to inspect.
  if (err === null || err === undefined) return false;

  const { code } = err as { code?: unknown };
  return code === "P2002";
}
|
||||
|
||||
/**
 * Build the reservation object for a delivery id.
 *
 * - `commit()` updates the row to `status: "done"`.
 * - `release()` deletes the row.
 *
 * Both swallow database errors and log a warning instead, so neither
 * returned promise rejects because of a failed query.
 *
 * @param webhookId Delivery id the row is keyed on.
 * @param shop      Shop domain, used in log messages only.
 * @param topic     Webhook topic, used in log messages only.
 * @param deps      Database surface to run the update / delete through.
 */
function makeReservation(
  webhookId: string,
  shop: string,
  topic: string,
  deps: DedupeDeps,
): WebhookReservation {
  // Run one database call and downgrade any failure to a warning.
  const bestEffort = async (
    op: () => Promise<unknown>,
    failureMessage: string,
  ): Promise<void> => {
    try {
      await op();
    } catch (err) {
      console.warn(failureMessage, err);
    }
  };

  return {
    webhookId,
    // The work already succeeded; a failed commit just risks a later
    // duplicate (which the side-effect code is expected to tolerate).
    commit: () =>
      bestEffort(
        () =>
          deps.db.processedWebhook.update({
            where: { webhookId },
            data: { status: "done" },
          }),
        `dedupe: failed to commit webhook ${webhookId} (${topic}/${shop}):`,
      ),
    release: () =>
      bestEffort(
        () => deps.db.processedWebhook.delete({ where: { webhookId } }),
        `dedupe: failed to release webhook ${webhookId} (${topic}/${shop}):`,
      ),
  };
}
|
||||
|
||||
/**
 * Reserve this Shopify webhook delivery for processing.
 *
 * Shopify retries a delivery (re-using the same `X-Shopify-Webhook-Id`) when
 * it doesn't receive a 200 within its ~5s timeout. Naively recording the id as
 * "processed" *before* doing the work meant that if the heavy background work
 * later failed (SMTP/GraphQL/PDF error), Shopify's retry was dropped as a
 * duplicate and the invoice was never sent.
 *
 * This uses a two-phase reserve/commit keyed on the webhook id, with the
 * unique `webhookId` primary key as the concurrency lock:
 *
 * - RESERVE: insert a `status="processing"` row. A unique-constraint
 *   violation (`P2002`) means the id is already claimed; we then inspect the
 *   existing row:
 *     - `done`              → genuine duplicate → return `null` (skip).
 *     - `processing`, fresh → another delivery is in flight → `null`.
 *     - `processing`, stale → previous worker crashed → reclaim & retry.
 *     - row missing         → it was released in between → insert it again.
 * - COMMIT  (caller, on success) → flip the row to `status="done"`.
 * - RELEASE (caller, on failure) → delete the row so a retry reprocesses.
 *
 * Fail-open: a dedupe-table error (other than P2002) on insert never silently
 * drops a webhook — we return a no-op reservation and let the work run.
 *
 * NOTE(review): the stale-lease reclaim is a read followed by an
 * unconditional update, so two deliveries that arrive together after the
 * lease expired can both reclaim. Confirm whether that window matters.
 *
 * @param request Incoming webhook request; only the `x-shopify-webhook-id`
 *   header is read.
 * @param shop    Shop domain, stored on the row and used in log messages.
 * @param topic   Webhook topic, stored on the row and used in log messages.
 * @param deps    Database surface; defaults to the real client.
 * @returns A {@link WebhookReservation} when the caller should process the
 *   delivery, or `null` when it must short-circuit (duplicate / concurrent).
 */
export async function reserveWebhook(
  request: Request,
  shop: string,
  topic: string,
  deps: DedupeDeps = { db },
): Promise<WebhookReservation | null> {
  // Opportunistically schedule TTL cleanup (runtime-only; never in build/CLI
  // since this is reached only while handling a live webhook request).
  ensureWebhookCleanupScheduled();

  const webhookId = request.headers.get("x-shopify-webhook-id");
  if (!webhookId) {
    // No id (unit tests / non-Shopify callers): process without dedupe.
    return noopReservation(null);
  }

  const reservation = makeReservation(webhookId, shop, topic, deps);

  // The insert is the lock: it only succeeds for the first claimant of an id.
  const insertProcessingRow = (): Promise<unknown> =>
    deps.db.processedWebhook.create({
      data: { webhookId, topic, shopDomain: shop, status: "processing" },
    });

  try {
    await insertProcessingRow();
    return reservation;
  } catch (err) {
    if (!isP2002(err)) {
      // Don't fail (or silently drop) a webhook on a logging-table issue.
      console.warn(`dedupe: failed to reserve webhook ${webhookId} (${topic}/${shop}):`, err);
      return noopReservation(webhookId);
    }
  }

  // A row already exists. Classify it.
  let existing: ProcessedRow | null = null;
  try {
    existing = await deps.db.processedWebhook.findUnique({ where: { webhookId } });
  } catch (err) {
    console.warn(`dedupe: failed to load existing webhook ${webhookId} (${topic}/${shop}):`, err);
    // Another worker owns the row and we can't classify it — be safe and skip.
    return null;
  }

  if (!existing) {
    // Raced with a release/delete between create() and findUnique(). Insert
    // the row again so the reservation we hand out is backed by a row that
    // commit() can update; otherwise the delivery would never be recorded.
    try {
      await insertProcessingRow();
      return reservation;
    } catch (err) {
      if (isP2002(err)) {
        // Someone else claimed the id in the meantime.
        console.log(
          `dedupe: ${topic} for ${shop} already in-flight (webhookId=${webhookId}); ` +
            `skipping concurrent delivery`,
        );
        return null;
      }
      console.warn(`dedupe: failed to reserve webhook ${webhookId} (${topic}/${shop}):`, err);
      return noopReservation(webhookId);
    }
  }

  if (existing.status === "done") {
    console.log(
      `dedupe: skipping already-processed ${topic} for ${shop} (webhookId=${webhookId})`,
    );
    return null;
  }

  const age = Date.now() - new Date(existing.receivedAt).getTime();
  if (age > STALE_LEASE_MS) {
    // The worker that reserved this crashed mid-process (or left a stale row).
    // Renew the lease and retry the work.
    try {
      await deps.db.processedWebhook.update({
        where: { webhookId },
        data: { status: "processing", receivedAt: new Date() },
      });
    } catch (err) {
      console.warn(`dedupe: failed to reclaim stale webhook ${webhookId}:`, err);
      return null;
    }
    console.log(
      `dedupe: reclaiming stale ${topic} reservation for ${shop} ` +
        `(webhookId=${webhookId}, age=${Math.round(age / 1000)}s)`,
    );
    return reservation;
  }

  // A fresh "processing" row: another delivery is actively working on it.
  // Skip this concurrent delivery. Shopify will retry; if the active worker
  // fails it releases the reservation so a later retry reprocesses.
  console.log(
    `dedupe: ${topic} for ${shop} already in-flight (webhookId=${webhookId}); ` +
      `skipping concurrent delivery`,
  );
  return null;
}
|
||||
|
||||
Reference in New Issue
Block a user