diff --git a/bash/README.md b/bash/README.md index 2536764..7aae666 100644 --- a/bash/README.md +++ b/bash/README.md @@ -40,6 +40,7 @@ This template uses a thin Python gRPC infrastructure layer to handle the network 4. **Configure your connection** by editing `config.yaml`: ```yaml server_address: "your-dragonchain-server:50051" + chain_id: "your-chain-public-id" smart_contract_id: "your-smart-contract-id" api_key: "your-api-key" ``` @@ -56,13 +57,24 @@ This template uses a thin Python gRPC infrastructure layer to handle the network | Field | Description | Default | |-------|-------------|---------| | `server_address` | gRPC server address | Required | +| `chain_id` | Public chain id the SC is registered on (sent as `x-chain-id` metadata) | Required | | `smart_contract_id` | Your smart contract ID | Required | | `api_key` | API key for authentication | Required | | `use_tls` | Enable TLS encryption | `false` | | `tls_cert_path` | Path to TLS certificate | - | | `num_workers` | Concurrent transaction processors | `10` | -| `reconnect_delay_seconds` | Delay between reconnection attempts | `5` | -| `max_reconnect_attempts` | Max reconnect attempts (0 = infinite) | `0` | +| `reconnect_delay_seconds` | Base delay for exponential backoff between reconnect attempts | `3` | +| `max_backoff_seconds` | Ceiling for the exponential backoff | `120` | +| `max_reconnect_attempts` | Max reconnect attempts (0 = infinite, recommended) | `0` | + +## Durability guarantees (provided by the Python `main.py` runtime, no work for you) + +- **Server restart, update, crash, or network blip** → the runtime auto-reconnects and resumes processing. Transactions observed while the stream was down stay queued on the Dragonchain Prime side and are delivered (oldest first) on reconnect. +- **Client restart or long outage** → when this process comes back up (minutes, hours, months later), it rejoins the stream and prime re-delivers every still-pending transaction that should have invoked it. 
+- **Half-open TCP** (silent peer, resumed laptop, corporate NAT dropping idle flows) is detected within ~13 seconds via gRPC keepalive and triggers a reconnect. No dangling ghost streams. +- **Reconnect storms** are avoided: exponential backoff with jitter means many clients reconnecting after a server restart don't all slam `accept()` at the same instant. + +These are invariants of the runtime — you do not add any of this in `process.sh`. ## Implementing Your Smart Contract diff --git a/bash/config.yaml b/bash/config.yaml index 92b096c..d4415ab 100644 --- a/bash/config.yaml +++ b/bash/config.yaml @@ -4,6 +4,11 @@ # The gRPC server address to connect to server_address: "localhost:50051" +# The public chain id on which this smart contract is registered. +# Sent as the x-chain-id gRPC metadata header — prime rejects streams +# without it. +chain_id: "your-chain-public-id" + # Your smart contract ID (provided by Dragonchain) smart_contract_id: "your-smart-contract-id" @@ -19,6 +24,12 @@ use_tls: false # Number of worker threads for processing transactions concurrently num_workers: 10 -# Reconnect settings -reconnect_delay_seconds: 5 +# Reconnect settings. The client uses exponential backoff with jitter: +# effective delay = min(max_backoff_seconds, reconnect_delay_seconds * 2^attempts) + random(0, reconnect_delay_seconds). +# Keep max_reconnect_attempts at 0 (infinite) unless you have a specific +# reason to stop — the client is designed to survive arbitrarily long +# outages and resume processing from the prime-side queue when the +# server returns. +reconnect_delay_seconds: 3 +max_backoff_seconds: 120 max_reconnect_attempts: 0 # 0 = infinite retries diff --git a/python/README.md b/python/README.md index 33879b3..56aed40 100755 --- a/python/README.md +++ b/python/README.md @@ -36,6 +36,7 @@ A Python-based smart contract client for Dragonchain Prime that connects via gRP 4. 
**Configure your connection** by editing `config.yaml`: ```yaml server_address: "your-dragonchain-server:50051" + chain_id: "your-chain-public-id" smart_contract_id: "your-smart-contract-id" api_key: "your-api-key" ``` @@ -52,13 +53,24 @@ A Python-based smart contract client for Dragonchain Prime that connects via gRP | Field | Description | Default | |-------|-------------|---------| | `server_address` | gRPC server address | Required | +| `chain_id` | Public chain id the SC is registered on (sent as `x-chain-id` metadata) | Required | | `smart_contract_id` | Your smart contract ID | Required | | `api_key` | API key for authentication | Required | | `use_tls` | Enable TLS encryption | `false` | | `tls_cert_path` | Path to TLS certificate | - | | `num_workers` | Concurrent transaction processors | `10` | -| `reconnect_delay_seconds` | Delay between reconnection attempts | `5` | -| `max_reconnect_attempts` | Max reconnect attempts (0 = infinite) | `0` | +| `reconnect_delay_seconds` | Base delay for exponential backoff between reconnect attempts | `3` | +| `max_backoff_seconds` | Ceiling for the exponential backoff | `120` | +| `max_reconnect_attempts` | Max reconnect attempts (0 = infinite, recommended) | `0` | + +## Durability guarantees (provided by `main.py`, no work for you) + +- **Server restart, update, crash, or network blip** → the client auto-reconnects and resumes processing. Transactions observed while the stream was down stay queued on the Dragonchain Prime side and are delivered (oldest first) on reconnect. +- **Client restart or long outage** → when this process comes back up (minutes, hours, months later), it rejoins the stream and prime re-delivers every still-pending transaction that should have invoked it. +- **Half-open TCP** (silent peer, resumed laptop, corporate NAT dropping idle flows) is detected within ~13 seconds via gRPC keepalive and triggers a reconnect. No dangling ghost streams. 
+- **Reconnect storms** are avoided: exponential backoff with jitter means many clients reconnecting after a server restart don't all slam `accept()` at the same instant. The backoff schedule resets to its base delay once a stream has stayed healthy for more than 60 seconds. + +These are invariants of the template — you do not add any of this in `process.py`. ## Implementing Your Smart Contract diff --git a/python/config.yaml b/python/config.yaml index 92b096c..d4415ab 100755 --- a/python/config.yaml +++ b/python/config.yaml @@ -4,6 +4,11 @@ # The gRPC server address to connect to server_address: "localhost:50051" +# The public chain id on which this smart contract is registered. +# Sent as the x-chain-id gRPC metadata header — prime rejects streams +# without it. +chain_id: "your-chain-public-id" + # Your smart contract ID (provided by Dragonchain) smart_contract_id: "your-smart-contract-id" @@ -19,6 +24,12 @@ use_tls: false # Number of worker threads for processing transactions concurrently num_workers: 10 -# Reconnect settings -reconnect_delay_seconds: 5 +# Reconnect settings. The client uses exponential backoff with jitter: +# effective delay = min(max_backoff_seconds, reconnect_delay_seconds * 2^attempts) + random(0, reconnect_delay_seconds). +# Keep max_reconnect_attempts at 0 (infinite) unless you have a specific +# reason to stop — the client is designed to survive arbitrarily long +# outages and resume processing from the prime-side queue when the +# server returns. +reconnect_delay_seconds: 3 +max_backoff_seconds: 120 max_reconnect_attempts: 0 # 0 = infinite retries diff --git a/python/main.py b/python/main.py index 6b6616d..0e0f88b 100755 --- a/python/main.py +++ b/python/main.py @@ -13,6 +13,7 @@ import argparse import json import logging import queue +import random import signal import sys import threading @@ -41,6 +42,17 @@ logger = logging.getLogger("SmartContract") # Configuration and Client Infrastructure # Do not modify this file unless you need to customize the client behavior. 
# Implement your smart contract logic in process.py instead. +# +# Durability contract (provided by this file, no work for the user): +# - If the Dragonchain Prime server restarts, updates, or momentarily +# drops the network, this client auto-reconnects. Transactions +# observed during the outage are queued by prime and delivered once +# the stream is re-established. +# - If this client restarts (crash, deploy, long sleep), it rejoins +# the stream and prime re-delivers every still-pending transaction +# that should have invoked it, oldest first. +# - Half-open TCP (a silent peer that never sent FIN) is detected +# within ~13 s via gRPC keepalive pings. No dangling ghost streams. # ============================================================================= @@ -49,12 +61,14 @@ class Config: """Client configuration loaded from YAML.""" server_address: str + chain_id: str smart_contract_id: str api_key: str use_tls: bool = False tls_cert_path: Optional[str] = None num_workers: int = 10 - reconnect_delay_seconds: int = 5 + reconnect_delay_seconds: int = 3 + max_backoff_seconds: int = 120 max_reconnect_attempts: int = 0 # 0 = infinite @@ -73,15 +87,33 @@ class SmartContractClient: def connect(self) -> bool: """Establish connection to the gRPC server.""" try: + # Keepalive is the load-bearing piece for detecting a + # half-open connection. Without it, a silent peer (prime + # restarted without sending FIN; laptop resumed from sleep; + # corporate NAT dropped the flow) leaves us in a "connected" + # state until the OS-level TCP keepalive fires — on Linux + # that's ~2 hours by default. 10 s ping + 3 s timeout + # catches all of that within ~13 s. 
+ channel_options = [ + ("grpc.keepalive_time_ms", 10000), + ("grpc.keepalive_timeout_ms", 3000), + ("grpc.keepalive_permit_without_calls", 1), + ("grpc.http2.max_pings_without_data", 0), + ] + if self.config.use_tls: if not self.config.tls_cert_path: logger.error("TLS enabled but no certificate path provided") return False with open(self.config.tls_cert_path, "rb") as f: creds = grpc.ssl_channel_credentials(f.read()) - self.channel = grpc.secure_channel(self.config.server_address, creds) + self.channel = grpc.secure_channel( + self.config.server_address, creds, options=channel_options + ) else: - self.channel = grpc.insecure_channel(self.config.server_address) + self.channel = grpc.insecure_channel( + self.config.server_address, options=channel_options + ) self.stub = pb_grpc.SmartContractServiceStub(self.channel) logger.info(f"Connected to server at {self.config.server_address}") @@ -172,10 +204,13 @@ class SmartContractClient: logger.info(f"Started {self.config.num_workers} worker threads") - # Create metadata for authentication + # Create metadata for authentication + routing. x-chain-id is + # required by prime; missing it yields "missing chain ID" and + # the stream never receives transactions. 
metadata = [ ("x-api-key", self.config.api_key), ("x-smart-contract-id", self.config.smart_contract_id), + ("x-chain-id", self.config.chain_id), ] try: @@ -224,23 +259,38 @@ def load_config(path: str) -> Config: data = yaml.safe_load(f) # Validate required fields - required = ["server_address", "smart_contract_id", "api_key"] + required = ["server_address", "chain_id", "smart_contract_id", "api_key"] for field in required: if field not in data or not data[field]: raise ValueError(f"Missing required config field: {field}") return Config( server_address=data["server_address"], + chain_id=data["chain_id"], smart_contract_id=data["smart_contract_id"], api_key=data["api_key"], use_tls=data.get("use_tls", False), tls_cert_path=data.get("tls_cert_path"), num_workers=data.get("num_workers", 10), - reconnect_delay_seconds=data.get("reconnect_delay_seconds", 5), + reconnect_delay_seconds=data.get("reconnect_delay_seconds", 3), + max_backoff_seconds=data.get("max_backoff_seconds", 120), max_reconnect_attempts=data.get("max_reconnect_attempts", 0), ) +def next_backoff(config: Config, attempts: int) -> float: + """Compute the next reconnect delay in seconds using exponential + backoff with jitter. base * 2^attempts, capped at max_backoff_seconds, + plus random(0, base) jitter so many clients don't reconnect in + lockstep after a server restart.""" + base = max(config.reconnect_delay_seconds, 1) + cap = max(config.max_backoff_seconds, base) + shift = min(attempts, 10) # clamp exponent + delay = min(cap, base * (2 ** shift)) + jitter = random.uniform(0, base) + return delay + jitter + + def main(): parser = argparse.ArgumentParser(description="Dragonchain Smart Contract Client") parser.add_argument( @@ -269,15 +319,22 @@ def main(): signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) - # Connection loop with reconnection logic + # Connection loop with reconnection logic. 
A session that runs + # healthy for 60+ seconds resets the attempts counter so the next + # failure starts the exponential backoff schedule fresh. attempts = 0 + HEALTHY_RUN_SECONDS = 60 + while True: if client.connect(): - attempts = 0 - if not client.run(): - if not client.running: - logger.info("Shutdown requested") - break + start = time.monotonic() + ran_ok = client.run() + if time.monotonic() - start > HEALTHY_RUN_SECONDS: + attempts = 0 + if not ran_ok and not client.running: + logger.info("Shutdown requested") + client.close() + break client.close() @@ -286,8 +343,8 @@ def main(): logger.error(f"Max reconnection attempts ({config.max_reconnect_attempts}) reached") break - delay = config.reconnect_delay_seconds - logger.info(f"Reconnecting in {delay} seconds (attempt {attempts})...") + delay = next_backoff(config, attempts - 1) + logger.info(f"Reconnecting in {delay:.1f} seconds (attempt {attempts})...") time.sleep(delay) logger.info("Client shut down") diff --git a/typescript/README.md b/typescript/README.md index ffe4f6e..4a59092 100755 --- a/typescript/README.md +++ b/typescript/README.md @@ -23,6 +23,7 @@ A TypeScript/JavaScript-based smart contract client for Dragonchain Prime that c 3. 
**Configure your connection** by editing `config.yaml`: ```yaml server_address: "your-dragonchain-server:50051" + chain_id: "your-chain-public-id" smart_contract_id: "your-smart-contract-id" api_key: "your-api-key" ``` @@ -45,13 +46,24 @@ A TypeScript/JavaScript-based smart contract client for Dragonchain Prime that c | Field | Description | Default | |-------|-------------|---------| | `server_address` | gRPC server address | Required | +| `chain_id` | Public chain id the SC is registered on (sent as `x-chain-id` metadata) | Required | | `smart_contract_id` | Your smart contract ID | Required | | `api_key` | API key for authentication | Required | | `use_tls` | Enable TLS encryption | `false` | | `tls_cert_path` | Path to TLS certificate | - | | `num_workers` | Concurrent transaction processors | `10` | -| `reconnect_delay_seconds` | Delay between reconnection attempts | `5` | -| `max_reconnect_attempts` | Max reconnect attempts (0 = infinite) | `0` | +| `reconnect_delay_seconds` | Base delay for exponential backoff between reconnect attempts | `3` | +| `max_backoff_seconds` | Ceiling for the exponential backoff | `120` | +| `max_reconnect_attempts` | Max reconnect attempts (0 = infinite, recommended) | `0` | + +## Durability guarantees (provided by `src/main.ts`, no work for you) + +- **Server restart, update, crash, or network blip** → the client auto-reconnects and resumes processing. Transactions observed while the stream was down stay queued on the Dragonchain Prime side and are delivered (oldest first) on reconnect. +- **Client restart or long outage** → when this process comes back up (minutes, hours, months later), it rejoins the stream and prime re-delivers every still-pending transaction that should have invoked it. +- **Half-open TCP** (silent peer, resumed laptop, corporate NAT dropping idle flows) is detected within ~13 seconds via gRPC keepalive and triggers a reconnect. No dangling ghost streams. 
+- **Reconnect storms** are avoided: exponential backoff with jitter means many clients reconnecting after a server restart don't all slam `accept()` at the same instant. The backoff schedule resets to its base delay once a stream has stayed healthy for more than 60 seconds. + +These are invariants of the template — you do not add any of this in `src/process.ts`. ## Implementing Your Smart Contract diff --git a/typescript/config.yaml b/typescript/config.yaml index bd4b18c..98216f3 100755 --- a/typescript/config.yaml +++ b/typescript/config.yaml @@ -4,6 +4,11 @@ # The gRPC server address to connect to server_address: "localhost:50051" +# The public chain id on which this smart contract is registered. +# Sent as the x-chain-id gRPC metadata header — prime rejects streams +# without it. +chain_id: "your-chain-public-id" + # Your smart contract ID (provided by Dragonchain) smart_contract_id: "your-smart-contract-id" @@ -19,6 +24,12 @@ use_tls: false # Number of concurrent workers for processing transactions num_workers: 10 -# Reconnect settings -reconnect_delay_seconds: 5 +# Reconnect settings. The client uses exponential backoff with jitter: +# effective delay = min(max_backoff_seconds, reconnect_delay_seconds * 2^attempts) + random(0, reconnect_delay_seconds). +# Keep max_reconnect_attempts at 0 (infinite) unless you have a specific +# reason to stop — the client is designed to survive arbitrarily long +# outages and resume processing from the prime-side queue when the +# server returns. +reconnect_delay_seconds: 3 +max_backoff_seconds: 120 max_reconnect_attempts: 0 # 0 = infinite retries diff --git a/typescript/src/main.ts b/typescript/src/main.ts index 90eda5a..5f96ef6 100755 --- a/typescript/src/main.ts +++ b/typescript/src/main.ts @@ -33,16 +33,29 @@ const SmartContractService = protoDescriptor.remote_sc.SmartContractService; // Configuration and Client Infrastructure // Do not modify this file unless you need to customize the client behavior. 
// Implement your smart contract logic in process.ts instead. +// +// Durability contract (provided by this file, no work for the user): +// - If the Dragonchain Prime server restarts, updates, or momentarily +// drops the network, this client auto-reconnects. Transactions +// observed during the outage are queued by prime and delivered once +// the stream is re-established. +// - If this client restarts (crash, deploy, long sleep), it rejoins +// the stream and prime re-delivers every still-pending transaction +// that should have invoked it, oldest first. +// - Half-open TCP (a silent peer that never sent FIN) is detected +// within ~13 s via gRPC keepalive pings. No dangling ghost streams. // ============================================================================= interface Config { serverAddress: string; + chainId: string; smartContractId: string; apiKey: string; useTls: boolean; tlsCertPath?: string; numWorkers: number; reconnectDelaySeconds: number; + maxBackoffSeconds: number; maxReconnectAttempts: number; } @@ -91,9 +104,23 @@ class SmartContractClient { credentials = grpc.credentials.createInsecure(); } + // Keepalive is the load-bearing piece for detecting a half-open + // connection. Without it, a silent peer (prime restarted without + // sending FIN; laptop resumed from sleep; corporate NAT dropped + // the flow) leaves us in a "connected" state until the OS-level + // TCP keepalive fires — on Linux ~2 hours by default. 10 s ping + // + 3 s timeout catches all of that within ~13 s. 
+ const channelOptions = { + "grpc.keepalive_time_ms": 10000, + "grpc.keepalive_timeout_ms": 3000, + "grpc.keepalive_permit_without_calls": 1, + "grpc.http2.max_pings_without_data": 0, + }; + this.client = new SmartContractService( this.config.serverAddress, - credentials + credentials, + channelOptions ); console.log(`[SC-Client] Connected to server at ${this.config.serverAddress}`); @@ -175,10 +202,13 @@ class SmartContractClient { this.running = true; - // Create metadata for authentication + // Create metadata for authentication + routing. x-chain-id is + // required by prime; missing it yields "missing chain ID" and the + // stream never receives transactions. const metadata = new grpc.Metadata(); metadata.add("x-api-key", this.config.apiKey); metadata.add("x-smart-contract-id", this.config.smartContractId); + metadata.add("x-chain-id", this.config.chainId); return new Promise((resolve) => { // Establish bi-directional stream @@ -255,12 +285,14 @@ class SmartContractClient { interface RawConfig { server_address: string; + chain_id: string; smart_contract_id: string; api_key: string; use_tls?: boolean; tls_cert_path?: string; num_workers?: number; reconnect_delay_seconds?: number; + max_backoff_seconds?: number; max_reconnect_attempts?: number; } @@ -269,7 +301,7 @@ function loadConfig(configPath: string): Config { const raw = yaml.load(content) as RawConfig; // Validate required fields - const required = ["server_address", "smart_contract_id", "api_key"]; + const required = ["server_address", "chain_id", "smart_contract_id", "api_key"]; for (const field of required) { if (!(field in raw) || !raw[field as keyof RawConfig]) { throw new Error(`Missing required config field: ${field}`); @@ -278,16 +310,33 @@ function loadConfig(configPath: string): Config { return { serverAddress: raw.server_address, + chainId: raw.chain_id, smartContractId: raw.smart_contract_id, apiKey: raw.api_key, useTls: raw.use_tls ?? 
false, tlsCertPath: raw.tls_cert_path, numWorkers: raw.num_workers ?? 10, - reconnectDelaySeconds: raw.reconnect_delay_seconds ?? 5, + reconnectDelaySeconds: raw.reconnect_delay_seconds ?? 3, + maxBackoffSeconds: raw.max_backoff_seconds ?? 120, maxReconnectAttempts: raw.max_reconnect_attempts ?? 0, }; } +/** + * Compute the next reconnect delay in milliseconds using exponential + * backoff with jitter. base * 2^attempts, capped at maxBackoffSeconds, + * plus random(0, base) jitter so many clients don't reconnect in + * lockstep after a server restart. + */ +function nextBackoffMs(config: Config, attempts: number): number { + const baseSec = Math.max(config.reconnectDelaySeconds, 1); + const capSec = Math.max(config.maxBackoffSeconds, baseSec); + const shift = Math.min(attempts, 10); // clamp exponent + const delaySec = Math.min(capSec, baseSec * 2 ** shift); + const jitterSec = Math.random() * baseSec; + return Math.round((delaySec + jitterSec) * 1000); +} + // ============================================================================= // Main Entry Point // ============================================================================= @@ -325,13 +374,19 @@ async function main(): Promise { process.on("SIGINT", shutdown); process.on("SIGTERM", shutdown); - // Connection loop with reconnection logic + // Connection loop with reconnection logic. A session that runs + // healthy for 60+ seconds resets the attempts counter so the next + // failure starts the exponential backoff schedule fresh. 
let attempts = 0; + const HEALTHY_RUN_MS = 60 * 1000; while (true) { if (client.connect()) { - attempts = 0; + const start = Date.now(); const success = await client.run(); + if (Date.now() - start > HEALTHY_RUN_MS) { + attempts = 0; + } if (!success) { // Check if it was a graceful shutdown client.close(); @@ -352,12 +407,12 @@ async function main(): Promise { break; } - const delay = config.reconnectDelaySeconds; + const delayMs = nextBackoffMs(config, attempts - 1); console.log( - `[SC-Client] Reconnecting in ${delay} seconds (attempt ${attempts})...` + `[SC-Client] Reconnecting in ${(delayMs / 1000).toFixed(1)} seconds (attempt ${attempts})...` ); - await new Promise((resolve) => setTimeout(resolve, delay * 1000)); + await new Promise((resolve) => setTimeout(resolve, delayMs)); } console.log("[SC-Client] Client shut down");