# Process Management: Reliable Transcoding Termination

## Stash's Transcoding Architecture

### Core Problem

Live transcoding processes must terminate reliably when:

- Client disconnects
- User navigates away
- Server shuts down
- Network interruptions occur

### Three-Layer Termination Strategy

#### Layer 1: HTTP Context Cancellation

```go
// From internal/api/routes_scene.go
func (rs sceneRoutes) streamTranscode(w http.ResponseWriter, r *http.Request, streamType ffmpeg.StreamFormat) {
    // Uses the request context for automatic cleanup when the client goes away
    streamManager.ServeTranscode(w, r, options)
}
```

#### Layer 2: Context-Based Process Tracking

```go
// From internal/manager/stream_manager.go
type LockContext struct {
    context.Context
    cancel context.CancelFunc
    cmd    *exec.Cmd
    mu     sync.Mutex
}

func (c *LockContext) Cancel() {
    c.cancel() // Signal cancellation
    if c.cmd != nil {
        // Graceful termination with timeout; the buffered channel lets the
        // waiting goroutine exit even when the timeout branch wins
        done := make(chan error, 1)
        go func() {
            done <- c.cmd.Wait()
        }()

        select {
        case <-done:
            return
        case <-time.After(5 * time.Second):
            // Force kill after timeout
            if c.cmd.Process != nil {
                c.cmd.Process.Kill()
            }
            return
        }
    }
}
```

#### Layer 3: Centralized Process Registry

```go
// From internal/manager/stream_manager.go
type StreamManager struct {
    lockManager *fsutil.ReadLockManager
    processes   map[string]*LockContext
    mu          sync.RWMutex
}

func (s *StreamManager) Shutdown() {
    s.mu.Lock()
    defer s.mu.Unlock()

    for _, ctx := range s.processes {
        ctx.Cancel() // Kill all active processes
    }
    s.processes = make(map[string]*LockContext)
}
```

## Key Implementation Patterns

### 1. Context Propagation

```go
// Inside an HTTP handler: derive a cancellable context from the request,
// so a client disconnect propagates through the transcoding pipeline
ctx, cancel := context.WithCancel(r.Context())
defer cancel()

// Carry request metadata through the pipeline
ctx = context.WithValue(ctx, "request_id", requestID)

// Use throughout the transcoding pipeline; CommandContext kills the
// process when ctx is cancelled
ffmpegCmd := exec.CommandContext(ctx, "ffmpeg", args...)
```

### 2. Process Attachment Pattern

```go
type TranscodeContext struct {
    context.Context
    cmd       *exec.Cmd
    startTime time.Time
    filePath  string
}

func (c *TranscodeContext) AttachCommand(cmd *exec.Cmd) {
    c.cmd = cmd
    c.startTime = time.Now()
}

func (c *TranscodeContext) Terminate() error {
    if c.cmd == nil || c.cmd.Process == nil {
        return nil
    }

    // Send SIGTERM first for a graceful shutdown
    if err := c.cmd.Process.Signal(syscall.SIGTERM); err != nil {
        // Fall back to SIGKILL
        return c.cmd.Process.Kill()
    }

    // Wait for the process to exit
    done := make(chan error, 1)
    go func() {
        done <- c.cmd.Wait()
    }()

    select {
    case <-done:
        return nil
    case <-time.After(5 * time.Second):
        // Escalate to SIGKILL after the grace period
        return c.cmd.Process.Kill()
    }
}
```

### 3. File Lock Management

```go
// From pkg/fsutil/lock_manager.go
type ReadLockManager struct {
    activeLocks map[string]*lockEntry
    mu          sync.RWMutex
}

type lockEntry struct {
    ctx      context.Context
    cancel   context.CancelFunc
    refCount int
    filePath string
}

func (m *ReadLockManager) AddLock(filePath string) context.Context {
    m.mu.Lock()
    defer m.mu.Unlock()

    if entry, exists := m.activeLocks[filePath]; exists {
        entry.refCount++
        return entry.ctx
    }

    ctx, cancel := context.WithCancel(context.Background())
    m.activeLocks[filePath] = &lockEntry{
        ctx:      ctx,
        cancel:   cancel,
        refCount: 1,
        filePath: filePath,
    }

    return ctx
}

func (m *ReadLockManager) RemoveLock(filePath string) {
    m.mu.Lock()
    defer m.mu.Unlock()

    if entry, exists := m.activeLocks[filePath]; exists {
        entry.refCount--
        if entry.refCount <= 0 {
            entry.cancel()
            delete(m.activeLocks, filePath)
        }
    }
}
```

### 4. HTTP Connection Management

```go
// HTTP handler driven by the request context
func (s *StreamManager) ServeTranscode(w http.ResponseWriter, r *http.Request, options TranscodeOptions) {
    // The request context is cancelled automatically when the client
    // disconnects, so deriving from it ties the process to the connection;
    // no separate connection monitoring or hijacking is needed
    ctx, cancel := context.WithCancel(r.Context())
    defer cancel()

    // Start transcoding; CommandContext kills ffmpeg when ctx is cancelled
    cmd := exec.CommandContext(ctx, "ffmpeg", args...)

    // Stream output directly to the client
    w.Header().Set("Content-Type", "video/mp4")
    w.Header().Set("Cache-Control", "no-cache")

    cmd.Stdout = w
    if err := cmd.Start(); err != nil {
        http.Error(w, err.Error(), http.StatusInternalServerError)
        return
    }

    if err := cmd.Wait(); err != nil {
        // A cancelled context means the client went away; only log real failures
        if ctx.Err() != context.Canceled {
            logger.Errorf("Transcoding failed: %v", err)
        }
    }
}
```

## Next.js Adaptation Guide

### 1. AbortController Pattern

```typescript
// pages/api/transcode/[...slug].ts
import { spawn } from 'child_process';
import type { NextApiRequest, NextApiResponse } from 'next';

export default async function handler(req: NextApiRequest, res: NextApiResponse) {
  const controller = new AbortController();
  const { signal } = controller;

  // Abort when the client goes away
  req.on('close', () => {
    controller.abort();
  });

  req.on('aborted', () => {
    controller.abort();
  });

  try {
    const ffmpeg = spawn('ffmpeg', [
      '-i', req.query.file as string,
      '-f', 'mp4',
      '-movflags', 'frag_keyframe+empty_moov',
      'pipe:1'
    ], { signal });

    // Escalate to SIGKILL if the process survives the abort
    signal.addEventListener('abort', () => {
      if (!ffmpeg.killed) {
        ffmpeg.kill('SIGKILL');
      }
    });

    // Stream response
    res.setHeader('Content-Type', 'video/mp4');
    res.setHeader('Cache-Control', 'no-cache');

    ffmpeg.stdout.pipe(res);

    // Resolve on exit; also resolve on 'error' so an abort (which emits an
    // AbortError on the child) cannot hang the handler
    const exitCode = await new Promise<number | null>((resolve) => {
      ffmpeg.on('exit', (code) => resolve(code));
      ffmpeg.on('error', () => resolve(null));
    });

    if (exitCode !== 0 && !signal.aborted && !res.headersSent) {
      res.status(500).end();
    }
  } catch (error) {
    if (res.headersSent) {
      res.end();
    } else if (signal.aborted) {
      res.status(499).end(); // Client closed request
    } else {
      res.status(500).json({ error: 'Transcoding failed' });
    }
  }
}
```

### 2. Process Pool Management

```typescript
// lib/transcode/pool.ts
import { ChildProcess } from 'child_process';

class TranscodePool {
  private processes = new Map<string, {
    process: ChildProcess;
    startTime: Date;
    filePath: string;
  }>();

  add(key: string, process: ChildProcess, filePath: string) {
    this.processes.set(key, {
      process,
      startTime: new Date(),
      filePath
    });

    // Auto cleanup on exit
    process.on('exit', () => {
      this.processes.delete(key);
    });
  }

  killAll() {
    for (const { process } of this.processes.values()) {
      if (!process.killed) {
        process.kill('SIGKILL');
      }
    }
    this.processes.clear();
  }

  kill(key: string) {
    const entry = this.processes.get(key);
    if (entry && !entry.process.killed) {
      entry.process.kill('SIGKILL');
      this.processes.delete(key);
    }
  }

  getStats() {
    return Array.from(this.processes.entries()).map(([key, entry]) => ({
      key,
      filePath: entry.filePath,
      startTime: entry.startTime,
      uptime: Date.now() - entry.startTime.getTime()
    }));
  }
}

export const transcodePool = new TranscodePool();
```
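
To connect the pool to the AbortController handler above, a small helper can register each spawned process and make sure an aborted request also clears its pool entry. This is a minimal sketch; `trackTranscode` and its key scheme are assumptions for illustration, not part of Stash or of the handler shown earlier.

```typescript
// lib/transcode/track.ts -- hypothetical helper that registers a spawned
// ffmpeg process with the pool and kills it if the request is aborted first
import { ChildProcess } from 'child_process';
import { transcodePool } from './pool';

export function trackTranscode(
  key: string,
  child: ChildProcess,
  filePath: string,
  signal: AbortSignal
) {
  // The pool removes the entry automatically when the process exits
  transcodePool.add(key, child, filePath);

  // If the request is torn down before the process exits, kill it and
  // drop the entry explicitly
  signal.addEventListener('abort', () => transcodePool.kill(key));
}
```

Calling `trackTranscode(key, ffmpeg, filePath, signal)` right after `spawn()` keeps the pool's view of active processes in sync with the request lifecycle.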

### 3. Connection Monitoring

```typescript
// lib/transcode/connection-monitor.ts
import { ChildProcess } from 'child_process';

class ConnectionMonitor {
  private activeConnections = new Set<string>();

  trackConnection(connectionId: string, process: ChildProcess) {
    this.activeConnections.add(connectionId);

    // Periodically check for stale connections
    const checkInterval = setInterval(() => {
      // Stop polling once the process has been killed or has exited
      if (process.killed || process.exitCode !== null) {
        clearInterval(checkInterval);
        this.activeConnections.delete(connectionId);
        return;
      }

      // Check whether the connection is still alive; the exact check
      // depends on your transport (see the sketch below)
    }, 30000);
  }

  cleanupConnection(connectionId: string) {
    this.activeConnections.delete(connectionId);
  }

  getActiveCount() {
    return this.activeConnections.size;
  }
}

export const connectionMonitor = new ConnectionMonitor();
```
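
One way to fill in the liveness check left open above is to look at the state of the underlying socket. The sketch below assumes the caller hands the monitor the request's `req.socket` (or `res.socket`); that wiring is an assumption, not something the monitor above requires.

```typescript
// lib/transcode/liveness.ts -- hypothetical liveness check based on the
// underlying socket; how the socket reaches the monitor is up to the caller
import type { Socket } from 'net';

export function connectionAlive(socket: Socket | null | undefined): boolean {
  // A missing, destroyed, or no-longer-writable socket means the client is
  // gone and the associated transcode can be terminated
  if (!socket) {
    return false;
  }
  return !socket.destroyed && socket.writable;
}
```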

### 4. Graceful Shutdown

```typescript
// lib/transcode/shutdown.ts
import { transcodePool } from './pool';

function setupGracefulShutdown() {
  // Handle SIGTERM (Docker, PM2, etc.)
  process.on('SIGTERM', () => {
    console.log('Received SIGTERM, cleaning up...');
    transcodePool.killAll();
    process.exit(0);
  });

  // Handle SIGINT (Ctrl+C)
  process.on('SIGINT', () => {
    console.log('Received SIGINT, cleaning up...');
    transcodePool.killAll();
    process.exit(0);
  });

  // Handle uncaught exceptions
  process.on('uncaughtException', (error) => {
    console.error('Uncaught exception:', error);
    transcodePool.killAll();
    process.exit(1);
  });

  // Handle unhandled promise rejections
  process.on('unhandledRejection', (reason, promise) => {
    console.error('Unhandled rejection at:', promise, 'reason:', reason);
    transcodePool.killAll();
    process.exit(1);
  });
}

export { setupGracefulShutdown };
```
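
Where these hooks get installed depends on how the app is started. With a custom server entry point (a common choice when Next.js has to manage long-lived child processes), the wiring might look like the sketch below; `server.ts`, the port, and the use of a custom server are assumptions for illustration, not requirements of Next.js.

```typescript
// server.ts -- hypothetical custom server entry point
import { createServer } from 'http';
import next from 'next';
import { setupGracefulShutdown } from './lib/transcode/shutdown';

const app = next({ dev: process.env.NODE_ENV !== 'production' });
const handle = app.getRequestHandler();

app.prepare().then(() => {
  // Install signal handlers before accepting traffic so transcodes started
  // early in the process lifetime are still covered
  setupGracefulShutdown();

  createServer((req, res) => handle(req, res)).listen(3000, () => {
    console.log('Transcode server listening on :3000');
  });
});
```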

### 5. Health Monitoring

```typescript
// lib/transcode/health.ts
import { transcodePool } from './pool';

class TranscodeHealthMonitor {
  private healthCheckInterval: NodeJS.Timeout | null = null;

  startMonitoring() {
    this.healthCheckInterval = setInterval(() => {
      const stats = transcodePool.getStats();

      // Check for hanging processes
      stats.forEach(({ key, uptime }) => {
        if (uptime > 300000) { // 5 minutes
          console.warn(`Killing hanging transcode: ${key}`);
          transcodePool.kill(key);
        }
      });

      // Memory usage monitoring
      const memUsage = process.memoryUsage();
      if (memUsage.heapUsed / memUsage.heapTotal > 0.9) {
        console.warn('High memory usage, killing oldest transcodes');
        // Implement a cleanup strategy here (see the sketch below)
      }
    }, 30000);
  }

  stopMonitoring() {
    if (this.healthCheckInterval) {
      clearInterval(this.healthCheckInterval);
      this.healthCheckInterval = null;
    }
  }
}

export const healthMonitor = new TranscodeHealthMonitor();
```
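
One possible cleanup strategy for the high-memory branch is to kill the longest-running transcodes first, since they are the most likely to be stalled. This is a sketch built on the pool's `getStats()` and `kill()` methods; `killOldestTranscodes` is a hypothetical helper, not part of the code above.

```typescript
// lib/transcode/cleanup.ts -- hypothetical "kill oldest first" strategy
import { transcodePool } from './pool';

export function killOldestTranscodes(count = 1) {
  const oldest = transcodePool
    .getStats()
    .sort((a, b) => b.uptime - a.uptime) // longest-running first
    .slice(0, count);

  for (const { key, uptime } of oldest) {
    console.warn(`Killing long-running transcode ${key} (uptime ${uptime}ms)`);
    transcodePool.kill(key);
  }
}
```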

## Production Deployment

### PM2 Configuration

```javascript
// ecosystem.config.js
module.exports = {
  apps: [{
    name: 'transcode-server',
    script: 'server.js',
    instances: 'max',
    exec_mode: 'cluster',
    max_memory_restart: '1G',
    env: {
      NODE_ENV: 'production',
      TRANSCODE_TIMEOUT: '300000',
      MAX_CONCURRENT_TRANSCODES: '4'
    },
    kill_timeout: 10000,
    listen_timeout: 10000,
    shutdown_with_message: true
  }]
};
```
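
With `shutdown_with_message` enabled, PM2 requests shutdown by sending a `'shutdown'` IPC message instead of a signal, so the cleanup code needs a message listener alongside the signal handlers from the graceful-shutdown module. A minimal sketch, reusing the pool from earlier; the file name is an assumption:

```typescript
// lib/transcode/pm2-shutdown.ts -- hypothetical companion to the signal
// handlers, covering PM2's shutdown_with_message option
import { transcodePool } from './pool';

export function setupPm2Shutdown() {
  // PM2 delivers 'shutdown' over IPC when shutdown_with_message is enabled
  process.on('message', (msg) => {
    if (msg === 'shutdown') {
      console.log('Received PM2 shutdown message, cleaning up...');
      transcodePool.killAll();
      process.exit(0);
    }
  });
}
```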

### Docker Configuration

```dockerfile
FROM node:18-alpine

# Install FFmpeg
RUN apk add --no-cache ffmpeg

WORKDIR /app
COPY package*.json ./
RUN npm ci --only=production

COPY . .

# Limit the Node.js heap so the container has headroom for ffmpeg processes
ENV NODE_OPTIONS="--max-old-space-size=1024"

EXPOSE 3000

CMD ["node", "server.js"]
```

### Kubernetes Resource Limits

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: transcode-service
spec:
  template:
    spec:
      containers:
        - name: transcode
          image: transcode-service:latest
          resources:
            requests:
              memory: "512Mi"
              cpu: "500m"
            limits:
              memory: "2Gi"
              cpu: "2000m"
          lifecycle:
            preStop:
              exec:
                command: ["/bin/sh", "-c", "kill -TERM 1; sleep 10"]
```
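
For visibility into what each pod is actually doing, the pool's `getStats()` and the connection monitor's count can be exposed as a small status endpoint that a dashboard or external check can poll. A minimal sketch; the route path is an assumption:

```typescript
// pages/api/transcode/stats.ts -- hypothetical status endpoint exposing the
// pool's view of active transcodes
import type { NextApiRequest, NextApiResponse } from 'next';
import { transcodePool } from '../../../lib/transcode/pool';
import { connectionMonitor } from '../../../lib/transcode/connection-monitor';

export default function handler(_req: NextApiRequest, res: NextApiResponse) {
  // Report the number of tracked connections and per-process uptimes
  res.status(200).json({
    activeConnections: connectionMonitor.getActiveCount(),
    transcodes: transcodePool.getStats()
  });
}
```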

## Key Takeaways

1. **Always use context-based cancellation** for reliable cleanup
2. **Implement three-layer termination** (HTTP, context, process)
3. **Monitor resource usage** to prevent system overload
4. **Handle edge cases** (network drops, server restarts)
5. **Provide graceful degradation** when services are overloaded
6. **Log termination events** for debugging and monitoring (see the sketch after this list)
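
As a concrete example of the last point, a small helper can record why and when each transcode was terminated. This is a sketch; `logTermination` and its field names are hypothetical, not part of the code above.

```typescript
// lib/transcode/log.ts -- hypothetical structured logging for termination events
export type TerminationReason =
  | 'client-disconnect'
  | 'timeout'
  | 'shutdown'
  | 'memory-pressure';

export function logTermination(key: string, reason: TerminationReason, uptimeMs: number) {
  // One-line JSON so log aggregators can index the fields
  console.info(JSON.stringify({
    event: 'transcode_terminated',
    key,
    reason,
    uptimeMs,
    at: new Date().toISOString()
  }));
}
```

Calling it from the pool's `kill()`/`killAll()` paths and from the health monitor gives a single consistent trail of every termination.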

Together, these layers keep process termination reliable even under high load or adverse network conditions.