proc/native: fix race condition between Halt and process death (linux)
If a breakpoint is hit close to process death on a thread that isn't the group leader the process could die while we are trying to stop it. This can be easily reproduced by having the goroutine that's executing main.main (which will almost always run on the thread group leader) wait for a second goroutine before exiting, then setting a breakpoint on the second goroutine and stepping through it (see TestIssue1101 in proc_test.go). When stepping over the return instruction of main.f the deferred wg.Done() call will be executed which will cause the main goroutine to resume and proceed to exit. Both the temporary breakpoint on wg.Done and the temporary breakpoint on the return address of main.f will be in close proximity to main.main calling os.Exit() and causing the death of the thread group leader. Under these circumstances the call to native.(*Thread).waitFast in native.(*Thread).halt can hang forever due to a bug similar to https://sourceware.org/bugzilla/show_bug.cgi?id=12702 (see comment in native.(*Thread).wait for an explanation). Replacing waitFast with a normal wait work in most circumstances, however, besides the performance hit, it looks like in this circumstances trapWait sometimes receives a spurious SIGTRAP on the dying group leader which would cause the subsequent call to wait in halt to accidentally reap the process without noting that it did exit. Instead this patch removes the call to wait from halt and instead calls trapWait in a loop in setCurrentBreakpoints until all threads are set to running=false. This is also a better fix than the workaround to ESRCH error while setting current breakpoints implemented in 94b50d. Fixes #1101
This commit is contained in:
parent
fbd152f6a8
commit
f32ce1b21d
20
_fixtures/issue1101.go
Normal file
20
_fixtures/issue1101.go
Normal file
@ -0,0 +1,20 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
"sync"
|
||||
)
|
||||
|
||||
var wg sync.WaitGroup
|
||||
|
||||
func f(from string) {
|
||||
defer wg.Done()
|
||||
return
|
||||
}
|
||||
|
||||
func main() {
|
||||
wg.Add(1)
|
||||
go f("goroutine")
|
||||
wg.Wait()
|
||||
os.Exit(2)
|
||||
}
|
@ -230,10 +230,20 @@ func (dbp *Process) trapWait(pid int) (*Thread, error) {
|
||||
if err != nil {
|
||||
if err == sys.ESRCH {
|
||||
// thread died while we were adding it
|
||||
delete(dbp.threads, int(cloned))
|
||||
continue
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
dbp.haltMu.Lock()
|
||||
halt := dbp.halt
|
||||
dbp.haltMu.Unlock()
|
||||
if halt {
|
||||
dbp.halt = false
|
||||
th.running = false
|
||||
dbp.threads[int(wpid)].running = false
|
||||
return nil, nil
|
||||
}
|
||||
if err = th.Continue(); err != nil {
|
||||
if err == sys.ESRCH {
|
||||
// thread died while we were adding it
|
||||
@ -256,7 +266,7 @@ func (dbp *Process) trapWait(pid int) (*Thread, error) {
|
||||
dbp.haltMu.Lock()
|
||||
halt := dbp.halt
|
||||
dbp.haltMu.Unlock()
|
||||
if status.StopSignal() == sys.SIGTRAP && halt {
|
||||
if halt && (status.StopSignal() == sys.SIGTRAP || status.StopSignal() == sys.SIGSTOP) {
|
||||
th.running = false
|
||||
dbp.halt = false
|
||||
return th, nil
|
||||
@ -368,16 +378,28 @@ func (dbp *Process) wait(pid, options int) (int, *sys.WaitStatus, error) {
|
||||
}
|
||||
|
||||
func (dbp *Process) setCurrentBreakpoints(trapthread *Thread) error {
|
||||
// wait for all threads to stop
|
||||
for {
|
||||
allstopped := true
|
||||
for _, th := range dbp.threads {
|
||||
if th.running {
|
||||
allstopped = false
|
||||
break
|
||||
}
|
||||
}
|
||||
if allstopped {
|
||||
break
|
||||
}
|
||||
dbp.halt = true
|
||||
_, err := dbp.trapWait(-1)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
for _, th := range dbp.threads {
|
||||
if th.CurrentBreakpoint.Breakpoint == nil {
|
||||
if err := th.SetCurrentBreakpoint(); err != nil {
|
||||
if err == sys.ESRCH {
|
||||
// This thread quit between the point where we received the breakpoint and
|
||||
// the stop signal.
|
||||
delete(dbp.threads, th.ID)
|
||||
} else {
|
||||
return err
|
||||
}
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -120,22 +120,6 @@ func (thread *Thread) Stopped() bool {
|
||||
return thread.stopped()
|
||||
}
|
||||
|
||||
// Halt stops this thread from executing. Actual
|
||||
// implementation is OS dependant. Look in OS
|
||||
// thread file.
|
||||
func (thread *Thread) Halt() (err error) {
|
||||
defer func() {
|
||||
if err == nil {
|
||||
thread.running = false
|
||||
}
|
||||
}()
|
||||
if thread.Stopped() {
|
||||
return
|
||||
}
|
||||
err = thread.halt()
|
||||
return
|
||||
}
|
||||
|
||||
// SetCurrentBreakpoint sets the current breakpoint that this
|
||||
// thread is stopped at as CurrentBreakpoint on the thread struct.
|
||||
func (thread *Thread) SetCurrentBreakpoint() error {
|
||||
|
@ -27,6 +27,20 @@ type OSSpecificDetails struct {
|
||||
// be continued.
|
||||
var ErrContinueThread = fmt.Errorf("could not continue thread")
|
||||
|
||||
// Halt stops this thread from executing.
|
||||
func (thread *Thread) Halt() (err error) {
|
||||
defer func() {
|
||||
if err == nil {
|
||||
thread.running = false
|
||||
}
|
||||
}()
|
||||
if thread.Stopped() {
|
||||
return
|
||||
}
|
||||
err = thread.halt()
|
||||
return
|
||||
}
|
||||
|
||||
func (t *Thread) halt() (err error) {
|
||||
kret := C.thread_suspend(t.os.threadAct)
|
||||
if kret != C.KERN_SUCCESS {
|
||||
|
@ -16,17 +16,21 @@ type OSSpecificDetails struct {
|
||||
registers sys.PtraceRegs
|
||||
}
|
||||
|
||||
// Halt stops this thread from executing.
|
||||
func (thread *Thread) Halt() (err error) {
|
||||
if thread.Stopped() {
|
||||
return
|
||||
}
|
||||
err = thread.halt()
|
||||
return
|
||||
}
|
||||
|
||||
func (t *Thread) halt() (err error) {
|
||||
err = sys.Tgkill(t.dbp.pid, t.ID, sys.SIGSTOP)
|
||||
if err != nil {
|
||||
err = fmt.Errorf("halt err %s on thread %d", err, t.ID)
|
||||
return
|
||||
}
|
||||
_, _, err = t.dbp.waitFast(t.ID)
|
||||
if err != nil {
|
||||
err = fmt.Errorf("wait err %s on thread %d", err, t.ID)
|
||||
return
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
|
@ -18,6 +18,20 @@ type OSSpecificDetails struct {
|
||||
hThread syscall.Handle
|
||||
}
|
||||
|
||||
// Halt stops this thread from executing.
|
||||
func (thread *Thread) Halt() (err error) {
|
||||
defer func() {
|
||||
if err == nil {
|
||||
thread.running = false
|
||||
}
|
||||
}()
|
||||
if thread.Stopped() {
|
||||
return
|
||||
}
|
||||
err = thread.halt()
|
||||
return
|
||||
}
|
||||
|
||||
func (t *Thread) halt() (err error) {
|
||||
// Ignore the request to halt. On Windows, all threads are halted
|
||||
// on return from WaitForDebugEvent.
|
||||
|
@ -3410,3 +3410,45 @@ func TestIssue1137(t *testing.T) {
|
||||
assertNoError(v2.Unreadable, t, "iface2 unreadable")
|
||||
})
|
||||
}
|
||||
|
||||
func TestIssue1101(t *testing.T) {
|
||||
// If a breakpoint is hit close to process death on a thread that isn't the
|
||||
// group leader the process could die while we are trying to stop it.
|
||||
//
|
||||
// This can be easily reproduced by having the goroutine that's executing
|
||||
// main.main (which will almost always run on the thread group leader) wait
|
||||
// for a second goroutine before exiting, then setting a breakpoint on the
|
||||
// second goroutine and stepping through it (see TestIssue1101 in
|
||||
// proc_test.go).
|
||||
//
|
||||
// When stepping over the return instruction of main.f the deferred
|
||||
// wg.Done() call will be executed which will cause the main goroutine to
|
||||
// resume and proceed to exit. Both the temporary breakpoint on wg.Done and
|
||||
// the temporary breakpoint on the return address of main.f will be in
|
||||
// close proximity to main.main calling os.Exit() and causing the death of
|
||||
// the thread group leader.
|
||||
|
||||
withTestProcess("issue1101", t, func(p proc.Process, fixture protest.Fixture) {
|
||||
_, err := setFunctionBreakpoint(p, "main.f")
|
||||
assertNoError(err, t, "setFunctionBreakpoint()")
|
||||
assertNoError(proc.Continue(p), t, "Continue()")
|
||||
assertNoError(proc.Next(p), t, "Next() 1")
|
||||
assertNoError(proc.Next(p), t, "Next() 2")
|
||||
lastCmd := "Next() 3"
|
||||
exitErr := proc.Next(p)
|
||||
if exitErr == nil {
|
||||
lastCmd = "final Continue()"
|
||||
exitErr = proc.Continue(p)
|
||||
}
|
||||
if pexit, exited := exitErr.(proc.ProcessExitedError); exited {
|
||||
if pexit.Status != 2 && testBackend != "lldb" {
|
||||
// looks like there's a bug with debugserver on macOS that sometimes
|
||||
// will report exit status 0 instead of the proper exit status.
|
||||
t.Fatalf("process exited status %d (expected 2)", pexit.Status)
|
||||
}
|
||||
} else {
|
||||
assertNoError(exitErr, t, lastCmd)
|
||||
t.Fatalf("process did not exit after %s", lastCmd)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user