proc: workarounds for runtime.clone (#1470)

runtime.clone (on some operating systems?) work similarly to fork:
when a thread calls runtime.clone a new thread is created. For a
short period of time both the parent thread and the child thread
appear to be running the same goroutine, until the child thread
adjusts its TLS to point to the correct goroutine.

This means that proc.GetG for a thread that's currently running
'runtime.clone' could be wrong and, consequently, the field
proc.(G).thread of a G struct returned by GoroutinesInfo could be
also wrong. And, finally, that FindGoroutine could sometimes return
a *G with a bad associated thread if the goroutine of interest
recently called 'runtime.clone'.

To work around this problem this commit makes two changes:

1. proc.GetG will return nil for all threads executing runtime.clone.
2. FindGoroutine will return the selected goroutine as long as the
   ID matches the one requested.

Change (1) takes care of the 'runtime.clone' problem. If we stop
the target process shortly after a thread executed the SYSCALL
instruction in 'runtime.clone' there are three possibilities:

a. Both the parent thread and the child thread are stopped inside
'runtime.clone'. In this case the state we report is slightly
incorrect, because both threads will be reported as not running any
goroutine when we do know which goorutine one of them (the parent)
is running. This doesn't actually matter since runtime.clone is
always called on the system stack and therefore the goroutine in
runtime.allgs will have the correct location.

b. The child thread managed to exit 'runtime.clone' but the parent
thread didn't. This is similar to (a) but in this case GetG on the
child thread will return the correct goroutine. GetG on the parent
thread will still return (incorrectly) nil but this doesn't matter
for the samer reason as described in (a).

c. The parent thread managed to exit 'runtime.clone' but the child
thread didn't. In this case GetG will return the correct goroutine
both for the parent thread (because it's not executing runtime.clone)
and the child thread.

Change (2) means that even if a thread has a completely nonsensical
TLS (for example because it's set through cgo) evaluating a variable
with a valid GoroutineID will still work as long as it's the current
goroutine (which is the most common case). This change also doubles
as an optimization for FindGoroutine.

Fixes #1469
This commit is contained in:
Alessandro Arzilli 2019-02-26 18:22:33 +01:00 committed by Derek Parker
parent 3ba4bcf488
commit 520d792422
4 changed files with 71 additions and 13 deletions

14
_fixtures/issue1469.go Normal file

@ -0,0 +1,14 @@
package main
import "time"
func main() {
for i := 0; i < 5; i++ {
go func() {
time.Sleep(11 * time.Millisecond)
}()
}
x := 255
println(x) //break
}

@ -588,22 +588,30 @@ func GoroutinesInfo(dbp Process, start, count int) ([]*G, int, error) {
// FindGoroutine returns a G struct representing the goroutine
// specified by `gid`.
func FindGoroutine(dbp Process, gid int) (*G, error) {
if gid == -1 {
return dbp.SelectedGoroutine(), nil
if selg := dbp.SelectedGoroutine(); (gid == -1) || (selg != nil && selg.ID == gid) || (selg == nil && gid == 0) {
// Return the currently selected goroutine in the following circumstances:
//
// 1. if the caller asks for gid == -1 (because that's what a goroutine ID of -1 means in our API).
// 2. if gid == selg.ID.
// this serves two purposes: (a) it's an optimizations that allows us
// to avoid reading any other goroutine and, more importantly, (b) we
// could be reading an incorrect value for the goroutine ID of a thread.
// This condition usually happens when a goroutine calls runtime.clone
// and for a short period of time two threads will appear to be running
// the same goroutine.
// 3. if the caller asks for gid == 0 and the selected goroutine is
// either 0 or nil.
// Goroutine 0 is special, it either means we have no current goroutine
// (for example, running C code), or that we are running on a speical
// stack (system stack, signal handling stack) and we didn't properly
// detect it.
// Since there could be multiple goroutines '0' running simultaneously
// if the user requests it return the one that's already selected or
// nil if there isn't a selected goroutine.
return selg, nil
}
if gid == 0 {
// goroutine 0 is special, it either means we have no current goroutine
// (for example, running C code), or that we are running on a special
// stack (system stack, signal handling stack) and we didn't properly
// detect.
// If the user requested goroutine 0 and we the current thread is running
// on a goroutine 0 (or no goroutine at all) return the goroutine running
// on the current thread.
g, _ := GetG(dbp.CurrentThread())
if g == nil || g.ID == 0 {
return g, nil
}
return nil, fmt.Errorf("Unknown goroutine %d", gid)
}

@ -4164,3 +4164,35 @@ func TestGoroutinesInfoLimit(t *testing.T) {
}
})
}
func TestIssue1469(t *testing.T) {
withTestProcess("issue1469", t, func(p proc.Process, fixture protest.Fixture) {
setFileBreakpoint(p, t, fixture, 13)
assertNoError(proc.Continue(p), t, "Continue()")
gid2thread := make(map[int][]proc.Thread)
for _, thread := range p.ThreadList() {
g, _ := proc.GetG(thread)
if g == nil {
continue
}
gid2thread[g.ID] = append(gid2thread[g.ID], thread)
}
for gid := range gid2thread {
if len(gid2thread[gid]) > 1 {
t.Logf("too many threads running goroutine %d", gid)
for _, thread := range gid2thread[gid] {
t.Logf("\tThread %d", thread.ThreadID())
frames, err := proc.ThreadStacktrace(thread, 20)
if err != nil {
t.Logf("\t\tcould not get stacktrace %v", err)
}
for _, frame := range frames {
t.Logf("\t\t%#x at %s:%d (systemstack: %v)", frame.Call.PC, frame.Call.File, frame.Call.Line, frame.SystemStack)
}
}
}
}
})
}

@ -477,6 +477,10 @@ func newGVariable(thread Thread, gaddr uintptr, deref bool) (*Variable, error) {
// In order to get around all this craziness, we read the address of the G structure for
// the current thread from the thread local storage area.
func GetG(thread Thread) (*G, error) {
if loc, _ := thread.Location(); loc.Fn != nil && loc.Fn.Name == "runtime.clone" {
// When threads are executing runtime.clone the value of TLS is unreliable.
return nil, nil
}
gaddr, err := getGVariable(thread)
if err != nil {
return nil, err