proc/gdbserial: support call injection with rr backend (#2740)

Normally calls can't be performed on recorded processes, because the future instructions executed by the target are predetermined. The rr debugger, however, has a mechanism that allows this by taking the current state of the recording and allowing it to diverge from the recording temporarily. This commit adds support for starting and ending such diversions around function calls. Note: this requires rr version 5.5 or later to work, see: https://github.com/rr-debugger/rr/pull/2748
2021-10-14 20:06:14 +02:00 · 2021-10-14 20:06:14 +02:00 · 9a5d5bc996
commit 9a5d5bc996
parent c8f6c3a685
11 changed files with 120 additions and 14 deletions
--- a/pkg/proc/core/core.go
+++ b/pkg/proc/core/core.go
@ -278,6 +278,9 @@ func (dbp *process) SetUProbe(fnName string, goidOffset int64, args []ebpf.UProb
 	panic("not implemented")
 }

+// StartCallInjection notifies the backend that we are about to inject a function call.
+func (p *process) StartCallInjection() (func(), error) { return func() {}, nil }
+
 // ReadMemory will return memory from the core file at the specified location and put the
 // read memory into `data`, returning the length read, and returning an error if
 // the length read is shorter than the length of the `data` buffer.
--- a/pkg/proc/fncall.go
+++ b/pkg/proc/fncall.go
@ -132,6 +132,7 @@ type callInjection struct {
 	continueCompleted chan<- *G
 	continueRequest   <-chan continueRequest
 	startThreadID     int
+	endCallInjection  func()
 }

 func (callCtx *callContext) doContinue() *G {
@ -188,10 +189,16 @@ func EvalExpressionWithCalls(t *Target, g *G, expr string, retLoadCfg LoadConfig
 		continueCompleted: continueCompleted,
 	}

+	endCallInjection, err := t.proc.StartCallInjection()
+	if err != nil {
+		return err
+	}
+
 	t.fncallForG[g.ID] = &callInjection{
 		continueCompleted: continueCompleted,
 		continueRequest:   continueRequest,
 		startThreadID:     0,
+		endCallInjection:  endCallInjection,
 	}

 	go scope.EvalExpression(expr, retLoadCfg)
@ -230,7 +237,13 @@ func finishEvalExpressionWithCalls(t *Target, g *G, contReq continueRequest, ok
 	}

 	close(t.fncallForG[g.ID].continueCompleted)
-	delete(t.fncallForG, g.ID)
+	callinj := t.fncallForG[g.ID]
+	for goid := range t.fncallForG {
+		if t.fncallForG[goid] == callinj {
+			delete(t.fncallForG, goid)
+		}
+	}
+	callinj.endCallInjection()
 	return err
 }

--- a/pkg/proc/gdbserial/gdbserver.go
+++ b/pkg/proc/gdbserial/gdbserver.go
@ -131,6 +131,10 @@ var debugserverExecutablePaths = []string{
 // while there are still internal breakpoints set.
 var ErrDirChange = errors.New("direction change with internal breakpoints")

+// ErrStartCallInjectionBackwards is returned when trying to start a call
+// injection while the recording is being run backwards.
+var ErrStartCallInjectionBackwards = errors.New("can not start a call injection while running backwards")
+
 var checkCanUnmaskSignalsOnce sync.Once
 var canUnmaskSignalsCached bool

@ -151,7 +155,8 @@ type gdbProcess struct {

 	breakpoints proc.BreakpointMap

-	gcmdok         bool   // true if the stub supports g and G commands
+	gcmdok         bool   // true if the stub supports g and (maybe) G commands
+	_Gcmdok        bool   // true if the stub supports G command
 	threadStopInfo bool   // true if the stub supports qThreadStopInfo
 	tracedir       string // if attached to rr the path to the trace directory

@ -1207,6 +1212,39 @@ func (p *gdbProcess) ChangeDirection(dir proc.Direction) error {
 	return nil
 }

+// StartCallInjection notifies the backend that we are about to inject a function call.
+func (p *gdbProcess) StartCallInjection() (func(), error) {
+	if p.tracedir == "" {
+		return func() {}, nil
+	}
+	if p.conn.conn == nil {
+		return nil, proc.ErrProcessExited{Pid: p.conn.pid}
+	}
+	if p.conn.direction != proc.Forward {
+		return nil, ErrStartCallInjectionBackwards
+	}
+
+	// Normally it's impossible to inject function calls in a recorded target
+	// because the sequence of instructions that the target will execute is
+	// predetermined.
+	// RR however allows this in a "diversion". When a diversion is started rr
+	// takes the current state of the process and runs it forward as a normal
+	// process, not following the recording.
+	// The gdb serial protocol does not have a way to start a diversion and gdb
+	// (the main frontend of rr) does not know how to do it. Instead a
+	// diversion is started by reading siginfo, because that's the first
+	// request gdb does when starting a function call injection.
+
+	_, err := p.conn.qXfer("siginfo", "", true)
+	if err != nil {
+		return nil, err
+	}
+
+	return func() {
+		_ = p.conn.qXferWrite("siginfo", "") // rr always returns an error for qXfer:siginfo:write... even though it works
+	}, nil
+}
+
 // GetDirection returns the current direction of execution.
 func (p *gdbProcess) GetDirection() proc.Direction {
 	return p.conn.direction
@ -1649,8 +1687,14 @@ func (t *gdbThread) writeSomeRegisters(regNames ...string) error {
 }

 func (t *gdbThread) writeRegisters() error {
-	if t.p.gcmdok {
-		return t.p.conn.writeRegisters(t.strID, t.regs.buf)
+	if t.p.gcmdok && t.p._Gcmdok {
+		err := t.p.conn.writeRegisters(t.strID, t.regs.buf)
+		if isProtocolErrorUnsupported(err) {
+			t.p._Gcmdok = false
+		} else {
+			return err
+		}
+
 	}
 	for _, r := range t.regs.regs {
 		if r.ignoreOnWrite {
--- a/pkg/proc/gdbserial/gdbserver_conn.go
+++ b/pkg/proc/gdbserial/gdbserver_conn.go
@ -47,6 +47,8 @@ type gdbConn struct {
 	xcmdok                bool // x command can be used to transfer memory
 	goarch                string

+	useXcmd bool // forces writeMemory to use the 'X' command
+
 	log *logrus.Entry
 }

@ -405,6 +407,17 @@ func (conn *gdbConn) qXfer(kind, annex string, binary bool) ([]byte, error) {
 	return out, nil
 }

+// qXferWrite executes a 'qXfer' write with the specified kind and annex.
+func (conn *gdbConn) qXferWrite(kind, annex string) error {
+	conn.outbuf.Reset()
+	fmt.Fprintf(&conn.outbuf, "$qXfer:%s:write:%s:0:", kind, annex)
+	//TODO(aarzilli): if we ever actually need to write something with qXfer,
+	//this will need to be implemented properly. At the moment it is only used
+	//for a fake write to the siginfo kind, to end a diversion in 'rr'.
+	_, err := conn.exec(conn.outbuf.Bytes(), "qXfer")
+	return err
+}
+
 type breakpointType uint8

 const (
@ -988,8 +1001,16 @@ func writeAsciiBytes(w io.Writer, data []byte) {
 	}
 }

-// executes 'M' (write memory) command
+// writeMemory writes memory using either 'M' or 'X'
 func (conn *gdbConn) writeMemory(addr uint64, data []byte) (written int, err error) {
+	if conn.useXcmd {
+		return conn.writeMemoryBinary(addr, data)
+	}
+	return conn.writeMemoryHex(addr, data)
+}
+
+// executes 'M' (write memory) command
+func (conn *gdbConn) writeMemoryHex(addr uint64, data []byte) (written int, err error) {
 	if len(data) == 0 {
 		// LLDB can't parse requests for 0-length writes and hangs if we emit them
 		return 0, nil
@ -1007,6 +1028,27 @@ func (conn *gdbConn) writeMemory(addr uint64, data []byte) (written int, err err
 	return len(data), nil
 }

+func (conn *gdbConn) writeMemoryBinary(addr uint64, data []byte) (written int, err error) {
+	conn.outbuf.Reset()
+	fmt.Fprintf(&conn.outbuf, "$X%x,%x:", addr, len(data))
+
+	for _, b := range data {
+		switch b {
+		case '#', '$', '}':
+			conn.outbuf.WriteByte('}')
+			conn.outbuf.WriteByte(b ^ escapeXor)
+		default:
+			conn.outbuf.WriteByte(b)
+		}
+	}
+
+	_, err = conn.exec(conn.outbuf.Bytes(), "memory write")
+	if err != nil {
+		return 0, err
+	}
+	return len(data), nil
+}
+
 func (conn *gdbConn) allocMemory(sz uint64) (uint64, error) {
 	conn.outbuf.Reset()
 	fmt.Fprintf(&conn.outbuf, "$_M%x,rwx", sz)
--- a/pkg/proc/gdbserial/rr.go
+++ b/pkg/proc/gdbserial/rr.go
@ -154,6 +154,7 @@ func Replay(tracedir string, quiet, deleteOnDetach bool, debugInfoDirs []string)

 	p := newProcess(rrcmd.Process)
 	p.tracedir = tracedir
+	p.conn.useXcmd = true // 'rr' does not support the 'M' command which is what we would usually use to write memory, this is only important during function calls, in any other situation writing memory will fail anyway.
 	if deleteOnDetach {
 		p.onDetach = func() {
 			safeRemoveAll(p.tracedir)
--- a/pkg/proc/interface.go
+++ b/pkg/proc/interface.go
@ -55,6 +55,9 @@ type ProcessInternal interface {
 	MemoryMap() ([]MemoryMapEntry, error)

 	GetBufferedTracepoints() []ebpf.RawUProbeParams
+
+	// StartCallInjection notifies the backend that we are about to inject a function call.
+	StartCallInjection() (func(), error)
 }

 // RecordingManipulation is an interface for manipulating process recordings.
--- a/pkg/proc/native/proc.go
+++ b/pkg/proc/native/proc.go
@ -106,6 +106,9 @@ func (dbp *nativeProcess) Checkpoints() ([]proc.Checkpoint, error) { return nil,
 // only supported in recorded traces.
 func (dbp *nativeProcess) ClearCheckpoint(int) error { return proc.ErrNotRecorded }

+// StartCallInjection notifies the backend that we are about to inject a function call.
+func (dbp *nativeProcess) StartCallInjection() (func(), error) { return func() {}, nil }
+
 // Detach from the process being debugged, optionally killing it.
 func (dbp *nativeProcess) Detach(kill bool) (err error) {
 	if dbp.exited {
--- a/pkg/proc/target.go
+++ b/pkg/proc/target.go
@ -246,9 +246,6 @@ func (t *Target) Valid() (bool, error) {
 // Currently only non-recorded processes running on AMD64 support
 // function calls.
 func (t *Target) SupportsFunctionCalls() bool {
-	if ok, _ := t.Process.Recorded(); ok {
-		return false
-	}
 	return t.Process.BinInfo().Arch.Name == "amd64"
 }

--- a/pkg/proc/target_exec.go
+++ b/pkg/proc/target_exec.go
@ -125,10 +125,6 @@ func (dbp *Target) Continue() error {
 		switch {
 		case curbp.Breakpoint == nil:
 			// runtime.Breakpoint, manual stop or debugCallV1-related stop
-			recorded, _ := dbp.Recorded()
-			if recorded {
-				return conditionErrors(threads)
-			}

 			loc, err := curthread.Location()
 			if err != nil || loc.Fn == nil {
@ -139,6 +135,9 @@ func (dbp *Target) Continue() error {

 			switch {
 			case loc.Fn.Name == "runtime.breakpoint":
+				if recorded, _ := dbp.Recorded(); recorded {
+					return conditionErrors(threads)
+				}
 				// In linux-arm64, PtraceSingleStep seems cannot step over BRK instruction
 				// (linux-arm64 feature or kernel bug maybe).
 				if !arch.BreakInstrMovesPC() {
--- a/pkg/proc/test/support.go
+++ b/pkg/proc/test/support.go
@ -314,7 +314,7 @@ func MustSupportFunctionCalls(t *testing.T, testBackend string) {
 		t.Skip("this version of Go does not support function calls")
 	}

-	if testBackend == "rr" || (runtime.GOOS == "darwin" && testBackend == "native") {
+	if runtime.GOOS == "darwin" && testBackend == "native" {
 		t.Skip("this backend does not support function calls")
 	}

--- a/service/test/variables_test.go
+++ b/service/test/variables_test.go
@ -1171,6 +1171,7 @@ type testCaseCallFunction struct {

 func TestCallFunction(t *testing.T) {
 	protest.MustSupportFunctionCalls(t, testBackend)
+	protest.AllowRecording(t)

 	var testcases = []testCaseCallFunction{
 		// Basic function call injection tests
@ -1396,7 +1397,7 @@ func testCallFunction(t *testing.T, p *proc.Target, tc testCaseCallFunction) {
 	}

 	if len(retvals) != len(tc.outs) {
-		t.Fatalf("call %q: wrong number of return parameters", tc.expr)
+		t.Fatalf("call %q: wrong number of return parameters (%#v)", tc.expr, retvals)
 	}

 	for i := range retvals {