sc-templates/go/main.go
Andrew Miller 2bc57c073d go: keepalive, exponential backoff, chain_id metadata, durability guarantees
Three related fixes that turn the Go template into a client that
survives the full matrix of failure modes: server restarts, client
restarts, network blips, half-open TCP, and long outages (hours to
months), all without the user writing a line of reconnect logic in
process.go.

1. gRPC keepalive: Time=10s, Timeout=3s, PermitWithoutStream=true.
   Half-open TCP (silent server restart, resumed laptop, NAT drop)
   is detected within ~13s. Previously the OS TCP keepalive took
   ~2h to notice, leaving the client as a ghost stream while prime
   logged "no active gRPC connection" for every skipped transaction.

2. Exponential backoff with jitter on reconnect. Effective delay =
   min(max_backoff_seconds, reconnect_delay_seconds * 2^attempts)
   + random(0, reconnect_delay_seconds). The attempts counter resets
   after any session that runs healthy for 60+ seconds. Jitter
   desynchronises clients so a server restart doesn't trigger a
   thundering herd. New max_backoff_seconds config field, default 120.
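
   With the defaults (base 3 s, cap 120 s) plugged into the formula,

       delay(n) = min(120, 3 * 2^n) + random(0, 3)

   the schedule works out to roughly 3, 6, 12, 24, 48, 96, 120, 120,
   ... seconds, each plus up to 3 s of random jitter.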

3. Unified error signalling: the sender goroutine now tears down the
   stream's context when it hits a Send error. Previously only Recv
   errors triggered a reconnect, so a stale stream where only Send
   was broken could sit there indefinitely.
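
   The shape of the fix, condensed from Run() in main.go below: both
   directions share one cancellable context, so a failure on either
   side ends both.

       streamCtx, streamCancel := context.WithCancel(ctx)
       go func() { // sender
           for resp := range responseChan {
               if err := stream.Send(resp); err != nil {
                   streamCancel() // wakes the stream.Recv loop too
                   return
               }
           }
       }()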

Also: chain_id is now a required config field and goes in the
x-chain-id gRPC metadata header alongside x-api-key and
x-smart-contract-id. Prime rejects streams that lack it with a
"missing chain ID" error, which was silently breaking every
template-based client until users discovered it the hard way. The
README documents the durability contract so contract authors know
they don't have to reimplement any of it.
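
Example config.yaml with the new fields (values are placeholders;
the field names come from the Config struct in main.go):

    server_address: "prime.example.com:443"
    chain_id: "your-chain-id"               # new: required, sent as x-chain-id
    smart_contract_id: "your-contract-id"
    api_key: "your-api-key"
    use_tls: true
    num_workers: 10
    reconnect_delay_seconds: 3              # base backoff
    max_backoff_seconds: 120                # new: caps the exponential backoff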
2026-04-19 21:23:47 -04:00


package main

import (
	"context"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"math/rand"
	"os"
	"os/signal"
	"sync"
	"syscall"
	"time"

	pb "github.com/your-org/smart-contract/proto"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/credentials/insecure"
	"google.golang.org/grpc/keepalive"
	"google.golang.org/grpc/metadata"
	"gopkg.in/yaml.v3"
)

// =============================================================================
// Configuration and Client Infrastructure
// Do not modify this file unless you need to customize the client behavior.
// Implement your smart contract logic in process.go instead.
//
// Durability contract (provided by this file, no work for the user):
//   - If the Dragonchain Prime server restarts, updates, or momentarily
//     drops the network, this client auto-reconnects. Transactions
//     observed during the outage are queued by prime and delivered once
//     the stream is re-established.
//   - If this client restarts (crash, deploy, long sleep), it rejoins
//     the stream and prime re-delivers every still-pending transaction
//     that should have invoked it, oldest first.
//   - Half-open TCP (a silent peer that never sent FIN) is detected
//     within ~13 s via gRPC keepalive pings. No dangling ghost streams.
// =============================================================================

// Config holds the client configuration loaded from YAML
type Config struct {
	ServerAddress   string `yaml:"server_address"`
	ChainID         string `yaml:"chain_id"`
	SmartContractID string `yaml:"smart_contract_id"`
	APIKey          string `yaml:"api_key"`
	UseTLS          bool   `yaml:"use_tls"`
	TLSCertPath     string `yaml:"tls_cert_path"`
	NumWorkers      int    `yaml:"num_workers"`
	// ReconnectDelaySecs is the BASE backoff between reconnect attempts.
	// The effective delay is `base * 2^attempts + jitter` capped at
	// MaxBackoffSeconds — so repeated failures back off, but a clean
	// server restart is picked up within a few seconds.
	ReconnectDelaySecs int `yaml:"reconnect_delay_seconds"`
	// MaxBackoffSeconds caps the exponential backoff. Default 120.
	MaxBackoffSeconds int `yaml:"max_backoff_seconds"`
	// MaxReconnectAttempts: 0 = infinite (default and recommended — the
	// whole point of this client is to stay available indefinitely).
	MaxReconnectAttempts int `yaml:"max_reconnect_attempts"`
}

// Client manages the gRPC connection and request processing
type Client struct {
	config     *Config
	conn       *grpc.ClientConn
	grpcClient pb.SmartContractServiceClient
	workChan   chan *pb.SmartContractRequest
	wg         sync.WaitGroup
	logger     *log.Logger
}

// NewClient creates a new smart contract client
func NewClient(config *Config) *Client {
	return &Client{
		config:   config,
		workChan: make(chan *pb.SmartContractRequest, config.NumWorkers*2),
		logger:   log.New(os.Stdout, "[SC-Client] ", log.LstdFlags|log.Lmicroseconds),
	}
}

// Connect establishes a connection to the gRPC server
func (c *Client) Connect() error {
	var opts []grpc.DialOption
	if c.config.UseTLS {
		creds, err := credentials.NewClientTLSFromFile(c.config.TLSCertPath, "")
		if err != nil {
			return fmt.Errorf("failed to load TLS credentials: %w", err)
		}
		opts = append(opts, grpc.WithTransportCredentials(creds))
	} else {
		opts = append(opts, grpc.WithTransportCredentials(insecure.NewCredentials()))
	}
	// Keepalive is the load-bearing piece for detecting a half-open
	// connection. Without it, a silent peer (prime restarted without
	// sending FIN; laptop resumed from sleep; corporate NAT dropped the
	// flow) leaves us in a "connected" state until the OS-level TCP
	// keepalive eventually fires — which on Linux defaults to ~2 hours.
	// 10 s ping + 3 s timeout catches all of that within ~13 s.
	opts = append(opts, grpc.WithKeepaliveParams(keepalive.ClientParameters{
		Time:                10 * time.Second,
		Timeout:             3 * time.Second,
		PermitWithoutStream: true,
	}))
	conn, err := grpc.NewClient(c.config.ServerAddress, opts...)
	if err != nil {
		return fmt.Errorf("failed to connect to server: %w", err)
	}
	c.conn = conn
	c.grpcClient = pb.NewSmartContractServiceClient(conn)
	c.logger.Printf("Connected to server at %s", c.config.ServerAddress)
	return nil
}

// Close closes the gRPC connection
func (c *Client) Close() error {
	if c.conn != nil {
		return c.conn.Close()
	}
	return nil
}

// Run starts the client and processes incoming requests. It returns
// when the stream terminates for any reason (server close, network
// error, ctx cancellation). The outer reconnect loop in main() calls
// Run again after a backoff.
func (c *Client) Run(ctx context.Context) error {
	// Wrap ctx with our own cancel so the sender goroutine can tear down
	// the stream on Send errors — otherwise stream.Recv() in the main
	// loop could block forever waiting for a peer that is never coming
	// back. Any cancel from here propagates to both directions of the
	// bidi stream.
	streamCtx, streamCancel := context.WithCancel(ctx)
	defer streamCancel()

	// Auth + routing metadata. x-chain-id is required by the server; a
	// missing header yields "missing chain ID" from prime and no
	// transactions will arrive.
	md := metadata.Pairs(
		"x-api-key", c.config.APIKey,
		"x-smart-contract-id", c.config.SmartContractID,
		"x-chain-id", c.config.ChainID,
	)
	streamCtx = metadata.NewOutgoingContext(streamCtx, md)

	// Establish the bi-directional stream
	stream, err := c.grpcClient.Run(streamCtx)
	if err != nil {
		return fmt.Errorf("failed to establish stream: %w", err)
	}
	c.logger.Printf("Stream established, starting %d workers", c.config.NumWorkers)

	// Channel to collect responses from workers
	responseChan := make(chan *pb.SmartContractResponse, c.config.NumWorkers*2)
	errChan := make(chan error, 2)

	// Start worker goroutines
	for i := 0; i < c.config.NumWorkers; i++ {
		c.wg.Add(1)
		go c.worker(streamCtx, responseChan)
	}

	// Sender: forwards worker responses back to the server. Any Send
	// error immediately cancels streamCtx so the Recv loop below exits
	// instead of blocking forever.
	go func() {
		for resp := range responseChan {
			if err := stream.Send(resp); err != nil {
				c.logger.Printf("Error sending response: %v", err)
				select {
				case errChan <- err:
				default:
				}
				streamCancel()
				return
			}
		}
	}()

	// Main loop: receive requests and dispatch to workers. stream.Recv
	// returns when the peer closes the stream, when streamCtx is cancelled
	// (e.g. because the sender goroutine hit an error), or on a real
	// transport error.
	var recvErr error
	for {
		req, err := stream.Recv()
		if err == io.EOF {
			c.logger.Println("Server closed the stream")
			break
		}
		if err != nil {
			recvErr = err
			break
		}
		c.logger.Printf("Received request: transaction_id=%s", req.TransactionId)
		select {
		case c.workChan <- req:
		case <-streamCtx.Done():
			recvErr = streamCtx.Err()
			goto cleanup
		}
	}

cleanup:
	// Tear down in-flight workers: close(workChan) lets the worker
	// goroutines see a closed channel and exit their receive loop
	// cleanly; the deferred streamCancel covers anything still
	// blocked on the context.
	close(c.workChan)
	c.wg.Wait()
	close(responseChan)
	c.workChan = make(chan *pb.SmartContractRequest, c.config.NumWorkers*2)
	if recvErr != nil {
		return fmt.Errorf("error receiving request: %w", recvErr)
	}
	// Surface any earlier Send error the sender goroutine parked on
	// errChan so the reconnect loop sees it.
	select {
	case err := <-errChan:
		return fmt.Errorf("stream send error: %w", err)
	default:
		return nil
	}
}

// worker processes requests from the work channel
func (c *Client) worker(ctx context.Context, responseChan chan<- *pb.SmartContractResponse) {
	defer c.wg.Done()
	for {
		select {
		case req, ok := <-c.workChan:
			if !ok {
				return
			}
			c.processRequest(ctx, req, responseChan)
		case <-ctx.Done():
			return
		}
	}
}

// processRequest handles a single request
func (c *Client) processRequest(ctx context.Context, req *pb.SmartContractRequest, responseChan chan<- *pb.SmartContractResponse) {
	// Log capture is a stub: logs stays empty here. In production you
	// might want a more sophisticated approach that captures output
	// from Process.
	var logs string
	// Call the user-defined Process function
	result := Process(ctx, req.TransactionJson, req.EnvVars, req.Secrets)

	// Build the response
	resp := &pb.SmartContractResponse{
		TransactionId: req.TransactionId,
		OutputToChain: result.OutputToChain,
		Logs:          logs,
	}
	if result.Error != nil {
		resp.Error = result.Error.Error()
		c.logger.Printf("Error processing transaction %s: %v", req.TransactionId, result.Error)
	} else {
		// Marshal the result data to JSON
		resultJSON, err := json.Marshal(result.Data)
		if err != nil {
			resp.Error = fmt.Sprintf("failed to marshal result: %v", err)
			c.logger.Printf("Error marshaling result for transaction %s: %v", req.TransactionId, err)
		} else {
			resp.ResultJson = string(resultJSON)
			c.logger.Printf("Successfully processed transaction %s", req.TransactionId)
		}
	}
	select {
	case responseChan <- resp:
	case <-ctx.Done():
	}
}

// LoadConfig loads configuration from a YAML file
func LoadConfig(path string) (*Config, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return nil, fmt.Errorf("failed to read config file: %w", err)
	}
	config := &Config{
		NumWorkers:         10,
		ReconnectDelaySecs: 3,
		MaxBackoffSeconds:  120,
	}
	if err := yaml.Unmarshal(data, config); err != nil {
		return nil, fmt.Errorf("failed to parse config file: %w", err)
	}
	// Validate required fields
	if config.ServerAddress == "" {
		return nil, fmt.Errorf("server_address is required")
	}
	if config.ChainID == "" {
		return nil, fmt.Errorf("chain_id is required")
	}
	if config.SmartContractID == "" {
		return nil, fmt.Errorf("smart_contract_id is required")
	}
	if config.APIKey == "" {
		return nil, fmt.Errorf("api_key is required")
	}
	return config, nil
}

// nextBackoff returns the duration to sleep before the next reconnect.
// Computed as base * 2^attempts with a random jitter in [0, base) and
// capped at MaxBackoffSeconds. Jitter matters when many clients
// reconnect simultaneously after a server restart — it desynchronises
// them so they don't all slam accept() at the same instant.
func nextBackoff(cfg *Config, attempts int) time.Duration {
	base := time.Duration(cfg.ReconnectDelaySecs) * time.Second
	if base <= 0 {
		base = 3 * time.Second
	}
	maxBackoff := time.Duration(cfg.MaxBackoffSeconds) * time.Second
	if maxBackoff <= 0 {
		maxBackoff = 120 * time.Second
	}
	// Cap the exponent so the shift can't overflow. With the default
	// base, 2^10 = 1024 already lands well past maxBackoff, but keep
	// the math bounded regardless.
	shift := attempts
	if shift > 10 {
		shift = 10
	}
	delay := base << shift
	if delay > maxBackoff {
		delay = maxBackoff
	}
	// Jitter range == base, independent of attempts. Adding it ensures
	// we don't schedule a thundering herd on the next attempt even if
	// every client started with the same `attempts` count.
	jitter := time.Duration(rand.Int63n(int64(base)))
	return delay + jitter
}

func main() {
	configPath := flag.String("config", "config.yaml", "Path to configuration file")
	flag.Parse()

	// Load configuration
	config, err := LoadConfig(*configPath)
	if err != nil {
		log.Fatalf("Failed to load config: %v", err)
	}

	// Create client
	client := NewClient(config)

	// Setup signal handling for graceful shutdown
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
	go func() {
		sig := <-sigChan
		log.Printf("Received signal %v, shutting down...", sig)
		cancel()
	}()

	// Connection loop with reconnection logic. A "successful" session
	// is one where Run() stayed up for at least 60 s; a session that
	// long is almost certainly real work, so reset attempts and let
	// the next failure start the backoff schedule fresh.
	attempts := 0
	const healthyRunThreshold = 60 * time.Second
	for {
		if err := client.Connect(); err != nil {
			log.Printf("Connection failed: %v", err)
		} else {
			start := time.Now()
			if err := client.Run(ctx); err != nil {
				if ctx.Err() != nil {
					log.Println("Shutdown requested")
					_ = client.Close()
					break
				}
				log.Printf("Stream error: %v", err)
			}
			if time.Since(start) > healthyRunThreshold {
				attempts = 0
			}
		}
		_ = client.Close()

		// Check if we should stop reconnecting
		if ctx.Err() != nil {
			break
		}
		attempts++
		if config.MaxReconnectAttempts > 0 && attempts >= config.MaxReconnectAttempts {
			log.Printf("Max reconnection attempts (%d) reached, exiting", config.MaxReconnectAttempts)
			break
		}
		delay := nextBackoff(config, attempts-1)
		log.Printf("Reconnecting in %v (attempt %d)...", delay, attempts)
		select {
		case <-time.After(delay):
		case <-ctx.Done():
			return
		}
	}
	log.Println("Client shut down")
}