fix(telegram): retry polling on all transient errors, not just 409 (#1397)

A single ETIMEDOUT/ECONNRESET/DNS failure during long-polling caused
bot.start() to reject; the catch block returned and polling stopped permanently.
The MCP server process stayed alive (stdin keeps it running), so outbound
reply/react tools kept working — but the bot was deaf to inbound messages
until a full restart. Users see 'typing...' then nothing, indistinguishable
from the harness-side gate bug.

Now all errors retry with the same capped backoff (max 15s); a 409 Conflict
that persists for 8 attempts still gives up. The `attempt` counter resets to 0
in `onStart` so backoff doesn't accumulate across a long-running session.

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Noah Zweben 2026-04-14 12:47:13 -07:00 committed by GitHub
parent 7f3389d21f
commit 7e401edac7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 21 additions and 24 deletions

View File

@ -1,7 +1,7 @@
{ {
"name": "telegram", "name": "telegram",
"description": "Telegram channel for Claude Code \u2014 messaging bridge with built-in access control. Manage pairing, allowlists, and policy via /telegram:access.", "description": "Telegram channel for Claude Code \u2014 messaging bridge with built-in access control. Manage pairing, allowlists, and policy via /telegram:access.",
"version": "0.0.5", "version": "0.0.6",
"keywords": [ "keywords": [
"telegram", "telegram",
"messaging", "messaging",

View File

@ -985,14 +985,17 @@ bot.catch(err => {
process.stderr.write(`telegram channel: handler error (polling continues): ${err.error}\n`) process.stderr.write(`telegram channel: handler error (polling continues): ${err.error}\n`)
}) })
// 409 Conflict = another getUpdates consumer is still active (zombie from a // Retry polling with backoff on any error. Previously only 409 was retried —
// previous session, or a second Claude Code instance). Retry with backoff // a single ETIMEDOUT/ECONNRESET/DNS failure rejected bot.start(), the catch
// until the slot frees up instead of crashing on the first rejection. // returned, and polling stopped permanently while the process stayed alive
// (MCP stdin keeps it running). Outbound tools kept working but the bot was
// deaf to inbound messages until a full restart.
void (async () => { void (async () => {
for (let attempt = 1; ; attempt++) { for (let attempt = 1; ; attempt++) {
try { try {
await bot.start({ await bot.start({
onStart: info => { onStart: info => {
attempt = 0
botUsername = info.username botUsername = info.username
process.stderr.write(`telegram channel: polling as @${info.username}\n`) process.stderr.write(`telegram channel: polling as @${info.username}\n`)
void bot.api.setMyCommands( void bot.api.setMyCommands(
@ -1008,8 +1011,10 @@ void (async () => {
return // bot.stop() was called — clean exit from the loop return // bot.stop() was called — clean exit from the loop
} catch (err) { } catch (err) {
if (shuttingDown) return if (shuttingDown) return
if (err instanceof GrammyError && err.error_code === 409) { // bot.stop() mid-setup rejects with grammy's "Aborted delay" — expected, not an error.
if (attempt >= 8) { if (err instanceof Error && err.message === 'Aborted delay') return
const is409 = err instanceof GrammyError && err.error_code === 409
if (is409 && attempt >= 8) {
process.stderr.write( process.stderr.write(
`telegram channel: 409 Conflict persists after ${attempt} attempts — ` + `telegram channel: 409 Conflict persists after ${attempt} attempts — ` +
`another poller is holding the bot token (stray 'bun server.ts' process or a second session). Exiting.\n`, `another poller is holding the bot token (stray 'bun server.ts' process or a second session). Exiting.\n`,
@ -1017,19 +1022,11 @@ void (async () => {
return return
} }
const delay = Math.min(1000 * attempt, 15000) const delay = Math.min(1000 * attempt, 15000)
const detail = attempt === 1 const detail = is409
? ' — another instance is polling (zombie session, or a second Claude Code running?)' ? `409 Conflict${attempt === 1 ? ' — another instance is polling (zombie session, or a second Claude Code running?)' : ''}`
: '' : `polling error: ${err}`
process.stderr.write( process.stderr.write(`telegram channel: ${detail}, retrying in ${delay / 1000}s\n`)
`telegram channel: 409 Conflict${detail}, retrying in ${delay / 1000}s\n`,
)
await new Promise(r => setTimeout(r, delay)) await new Promise(r => setTimeout(r, delay))
continue
}
// bot.stop() mid-setup rejects with grammy's "Aborted delay" — expected, not an error.
if (err instanceof Error && err.message === 'Aborted delay') return
process.stderr.write(`telegram channel: polling failed: ${err}\n`)
return
} }
} }
})() })()