diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 4122b7ba237654928b86f571d16a7921b34d937f..4562e82c3722ea8fbb67367918ca170ad9e8a197 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -154,6 +154,39 @@ const (
 	// since the arena starts at address 0.
 	_MaxMem = 1<<_MHeapMap_TotalBits - 1
 
+	// memLimitBits is the maximum number of bits in a heap address.
+	//
+	// On 64-bit platforms, we limit this to 48 bits because that
+	// is the maximum supported by Linux across all 64-bit
+	// architectures, with the exception of s390x.
+	// s390x supports full 64-bit addresses, but the allocator
+	// will panic in the unlikely event we exceed 48 bits.
+	//
+	// On 32-bit platforms, we accept the full 32-bit address
+	// space because doing so is cheap.
+	// mips32 only has access to the low 2GB of virtual memory, so
+	// we further limit it to 31 bits.
+	//
+	// The size of the arena index is proportional to
+	// 1<<memLimitBits, so it's important that this not be too
+	// large. 48 bits is about the threshold; above that we would
+	// need to go to a two level arena index.
+	memLimitBits = _64bit*48 + (1-_64bit)*(32-(sys.GoarchMips+sys.GoarchMipsle))
+
+	// memLimit is one past the highest possible heap pointer value.
+	memLimit = 1 << memLimitBits
+
+	// heapArenaBytes is the size of a heap arena. The heap
+	// consists of mappings of size heapArenaBytes, aligned to
+	// heapArenaBytes. The initial heap mapping is one arena.
+	//
+	// TODO: Right now only the bitmap is divided into separate
+	// arenas, but shortly all of the heap will be.
+	heapArenaBytes = (64<<20)*_64bit + (4<<20)*(1-_64bit)
+
+	// heapArenaBitmapBytes is the size of each heap arena's bitmap.
+	heapArenaBitmapBytes = heapArenaBytes / (sys.PtrSize * 8 / 2)
+
 	// Max number of threads to run garbage collection.
 	// 2, 3, and 4 are all plausible maximums depending
 	// on the hardware details of the machine. The garbage
@@ -221,6 +254,12 @@ func mallocinit() {
 	testdefersizes()
 
+	if heapArenaBitmapBytes&(heapArenaBitmapBytes-1) != 0 {
+		// heapBits expects modular arithmetic on bitmap
+		// addresses to work.
+		throw("heapArenaBitmapBytes not a power of 2")
+	}
+
 	// Copy class sizes out for statistics table.
 	for i := range class_to_size {
 		memstats.by_size[i].size = uint32(class_to_size[i])
@@ -248,9 +287,6 @@ func mallocinit() {
 	// The spans array holds one *mspan per _PageSize of arena.
 	var spansSize uintptr = (_MaxMem + 1) / _PageSize * sys.PtrSize
 	spansSize = round(spansSize, _PageSize)
-	// The bitmap holds 2 bits per word of arena.
-	var bitmapSize uintptr = (_MaxMem + 1) / (sys.PtrSize * 8 / 2)
-	bitmapSize = round(bitmapSize, _PageSize)
 
 	// Set up the allocation arena, a contiguous area of memory where
 	// allocated data will be found.
@@ -275,9 +311,6 @@ func mallocinit() {
 		// not collecting memory because some non-pointer block of memory
 		// had a bit pattern that matched a memory address.
 		//
-		// Actually we reserve 544 GB (because the bitmap ends up being 32 GB)
-		// but it hardly matters: e0 00 is not valid UTF-8 either.
-		//
 		// If this fails we fall back to the 32 bit memory mechanism
 		//
 		// However, on arm64, we ignore all this advice above and slam the
 		// allocation at 0x40 << 32 because when using 4k pages with 3-level
 		// translation buffers, the user address space is limited to 39 bits
 		// On darwin/arm64, the address space is even smaller.
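Before continuing with the rest of mallocinit, an aside to make the new constants concrete: with 48-bit addresses and 64 MB arenas, the arena index has 2^22 entries (32 MB of pointers, reserved up front but mostly never touched), and each arena carries a 2 MB bitmap, since 2 bits per 8-byte word is 1/32 of the arena. That is why the comment above calls 48 bits the threshold before a two-level index would be needed. A standalone sketch of the arithmetic, assuming the 64-bit values of these constants (the program is illustrative only, not part of the patch):

```go
package main

import "fmt"

// Mirrors the patch's constants for a 64-bit platform (_64bit = 1,
// sys.PtrSize = 8). The concrete values are assumptions for illustration.
const (
	ptrSize              = 8
	memLimitBits         = 48
	memLimit             = 1 << memLimitBits // one past the highest heap address
	heapArenaBytes       = 64 << 20          // 64 MB per arena
	heapArenaBitmapBytes = heapArenaBytes / (ptrSize * 8 / 2)
)

func main() {
	arenaCount := memLimit / heapArenaBytes // one index entry per arena
	fmt.Println("arena index entries:", arenaCount)                  // 4194304
	fmt.Println("arena index size:", arenaCount*ptrSize>>20, "MB")   // 32 MB
	fmt.Println("bitmap per arena:", heapArenaBitmapBytes>>20, "MB") // 2 MB
}
```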
 		arenaSize := round(_MaxMem, _PageSize)
-		pSize = bitmapSize + spansSize + arenaSize + _PageSize
+		pSize = spansSize + arenaSize + _PageSize
 		for i := 0; i <= 0x7f; i++ {
 			switch {
 			case GOARCH == "arm64" && GOOS == "darwin":
@@ -344,7 +377,7 @@ func mallocinit() {
 		// away from the running binary image and then round up
 		// to a MB boundary.
 		p = round(firstmoduledata.end+(1<<18), 1<<20)
-		pSize = bitmapSize + spansSize + arenaSize + _PageSize
+		pSize = spansSize + arenaSize + _PageSize
 		if p <= procBrk && procBrk < p+pSize {
 			// Move the start above the brk,
 			// leaving some room for future brk
@@ -369,8 +402,6 @@ func mallocinit() {
 
 	spansStart := p1
 	p1 += spansSize
-	mheap_.bitmap_start = p1
-	p1 += bitmapSize
 	if sys.PtrSize == 4 {
 		// Set arena_start such that we can accept memory
 		// reservations located anywhere in the 4GB virtual space.
@@ -383,24 +414,18 @@ func mallocinit() {
 	mheap_.arena_alloc = p1
 	mheap_.arena_reserved = reserved
 
-	// Pre-compute the value heapBitsForAddr can use to directly
-	// map a heap address to a bitmap address. The obvious
-	// computation is:
-	//
-	//	bitp = bitmap_start + (addr - arena_start)/ptrSize/4
-	//
-	// We can shuffle this to
-	//
-	//	bitp = (bitmap_start - arena_start/ptrSize/4) + addr/ptrSize/4
-	//
-	// bitmap_delta is the value of the first term.
-	mheap_.bitmap_delta = mheap_.bitmap_start - mheap_.arena_start/heapBitmapScale
-
 	if mheap_.arena_start&(_PageSize-1) != 0 {
-		println("bad pagesize", hex(p), hex(p1), hex(spansSize), hex(bitmapSize), hex(_PageSize), "start", hex(mheap_.arena_start))
+		println("bad pagesize", hex(p), hex(p1), hex(spansSize), hex(_PageSize), "start", hex(mheap_.arena_start))
 		throw("misrounded allocation in mallocinit")
 	}
 
+	// Map the arena index. Most of this will never be touched.
+	var untracked uint64
+	mheap_.arenas = (*[memLimit / heapArenaBytes]*heapArena)(persistentalloc(unsafe.Sizeof(*mheap_.arenas), sys.PtrSize, &untracked))
+	if mheap_.arenas == nil {
+		throw("failed to allocate arena index")
+	}
+
 	// Initialize the rest of the allocator.
 	mheap_.init(spansStart, spansSize)
 	_g_ := getg()
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index e4f6b52b880ed61616c4f7b5f5ed7c4ec44f5b21..5e109f59064e00d9241eea29fe161102d90f31d3 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -13,14 +13,12 @@
 //
 // Heap bitmap
 //
-// The allocated heap comes from a subset of the memory in the range [start, used),
-// where start == mheap_.arena_start and used == mheap_.arena_used.
-// The heap bitmap comprises 2 bits for each pointer-sized word in that range,
-// stored in bytes indexed forward in memory from bitmap_start.
-// That is, the byte at address bitmap holds the 2-bit entries for the
-// four words start through start+3*ptrSize, the byte at
-// bitmap_start+1 holds the entries for start+4*ptrSize through
-// start+7*ptrSize, and so on.
+// The heap bitmap comprises 2 bits for each pointer-sized word in the heap,
+// stored in the heapArena metadata backing each heap arena.
+// That is, if ha is the heapArena for the arena starting at start,
+// then ha.bitmap[0] holds the 2-bit entries for the four words start
+// through start+3*ptrSize, ha.bitmap[1] holds the entries for
+// start+4*ptrSize through start+7*ptrSize, and so on.
 //
 // In each 2-bit entry, the lower bit holds the same information as in the 1-bit
 // bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
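To make the 2-bit layout just described concrete: within a single bitmap byte covering four heap words, word i's pointer bit is bit i (low nibble) and its scan bit is bit 4+i (high nibble). The bitPointer and bitScan values below match the runtime constants in the next hunk; the decoder itself is only an illustrative sketch:

```go
package main

import "fmt"

const (
	bitPointer = 1 << 0 // low nibble: pointer/scalar, one bit per word
	bitScan    = 1 << 4 // high nibble: scan bits, one bit per word
)

// decode extracts the 2-bit entry for word i (0..3) of the four heap
// words described by one bitmap byte.
func decode(bitmapByte uint8, i uint) (isPointer, needScan bool) {
	isPointer = bitmapByte&(bitPointer<<i) != 0
	needScan = bitmapByte&(bitScan<<i) != 0
	return
}

func main() {
	// Example: words 0 and 2 hold pointers; words 0 through 2 have
	// their scan bits set.
	b := uint8(bitPointer<<0 | bitPointer<<2 | bitScan<<0 | bitScan<<1 | bitScan<<2)
	for i := uint(0); i < 4; i++ {
		p, s := decode(b, i)
		fmt.Printf("word %d: pointer=%v scan=%v\n", i, p, s)
	}
}
```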
@@ -86,9 +84,8 @@ const (
 	bitPointer = 1 << 0
 	bitScan    = 1 << 4
 
-	heapBitsShift      = 1     // shift offset between successive bitPointer or bitScan entries
-	heapBitmapScale    = sys.PtrSize * (8 / 2) // number of data bytes described by one heap bitmap byte
-	wordsPerBitmapByte = 8 / 2 // heap words described by one bitmap byte
+	heapBitsShift      = 1     // shift offset between successive bitPointer or bitScan entries
+	wordsPerBitmapByte = 8 / 2 // heap words described by one bitmap byte
 
 	// all scan/pointer bits in a byte
 	bitScanAll = bitScan | bitScan<<heapBitsShift | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
@@ -137,28 +134,6 @@ func subtract1(p *byte) *byte {
 	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1))
 }
 
-// mapBits maps any additional bitmap memory needed for the new arena memory.
-//
-// Don't call this directly. Call mheap.setArenaUsed.
-//
-//go:nowritebarrier
-func (h *mheap) mapBits(arena_used uintptr) {
-	// Caller has added extra mappings to the arena.
-	// Add extra mappings of bitmap words as needed.
-	// We allocate extra bitmap pieces in chunks of bitmapChunk.
-	const bitmapChunk = 8192
-
-	n := (arena_used - mheap_.arena_start) / heapBitmapScale
-	n = round(n, bitmapChunk)
-	n = round(n, physPageSize)
-	if h.bitmap_mapped >= n {
-		return
-	}
-
-	sysMap(unsafe.Pointer(h.bitmap_start+h.bitmap_mapped), n-h.bitmap_mapped, h.arena_reserved, &memstats.gc_sys)
-	h.bitmap_mapped = n
-}
-
 // heapBits provides access to the bitmap bits for a single heap word.
 // The methods on heapBits take value receivers so that the compiler
 // can more easily inline calls to those methods and registerize the
@@ -166,8 +141,14 @@ func (h *mheap) mapBits(arena_used uintptr) {
 type heapBits struct {
 	bitp  *uint8
 	shift uint32
+	arena uint32 // Index of heap arena containing bitp
+	last  *uint8 // Last byte of the arena's bitmap
 }
 
+// Make the compiler check that heapBits.arena is large enough to hold
+// the maximum arena index.
+var _ = heapBits{arena: memLimit / heapArenaBytes}
+
 // markBits provides access to the mark bit for an object in the heap.
 // bytep points to the byte holding the mark bit.
 // mask is a byte with a single bit set that can be &ed with *bytep
@@ -349,14 +330,26 @@ func (m *markBits) advance() {
 }
 
 // heapBitsForAddr returns the heapBits for the address addr.
-// The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used).
+// The caller must ensure addr is in an allocated span.
+// In particular, be careful not to point past the end of an object.
 //
 // nosplit because it is used during write barriers and must not be preempted.
 //go:nosplit
 func heapBitsForAddr(addr uintptr) heapBits {
 	// 2 bits per word, 4 pairs per byte, and a mask is hard coded.
 	off := addr / sys.PtrSize
-	return heapBits{(*uint8)(unsafe.Pointer(mheap_.bitmap_delta + off/4)), uint32(off & 3)}
+	arena := addr / heapArenaBytes
+	ha := mheap_.arenas[arena]
+	// The compiler uses a load for nil checking ha, but in this
+	// case we'll almost never hit that cache line again, so it
+	// makes more sense to do a value check.
+	if ha == nil {
+		// addr is not in the heap. Crash without inhibiting inlining.
+		_ = *ha
+	}
+	bitp := &ha.bitmap[(off/4)%heapArenaBitmapBytes]
+	last := &ha.bitmap[len(ha.bitmap)-1]
+	return heapBits{bitp, uint32(off & 3), uint32(arena), last}
 }
 
 // heapBitsForSpan returns the heapBits for the span base address base.
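The index arithmetic in the new heapBitsForAddr is easy to sanity-check outside the runtime. A minimal sketch, assuming the 64-bit constant values; bitmapPos is a hypothetical helper that mirrors the patch's mapping from an address to an (arena, bitmap byte, shift) triple:

```go
package main

import "fmt"

const (
	ptrSize              = 8
	heapArenaBytes       = 64 << 20
	heapArenaBitmapBytes = heapArenaBytes / (ptrSize * 8 / 2)
)

// bitmapPos mirrors heapBitsForAddr: which arena addr falls in, which
// byte of that arena's bitmap describes it, and the position of its
// 2-bit entry within that byte.
func bitmapPos(addr uintptr) (arena, byteIdx, shift uintptr) {
	off := addr / ptrSize
	arena = addr / heapArenaBytes
	byteIdx = (off / 4) % heapArenaBitmapBytes
	shift = off & 3
	return
}

func main() {
	// Adjacent words share a bitmap byte: shifts 0 and 1 here.
	for _, addr := range []uintptr{0xc000000040, 0xc000000048} {
		a, b, s := bitmapPos(addr)
		fmt.Printf("addr %#x -> arena %d, bitmap byte %d, shift %d\n", addr, a, b, s)
	}
}
```

The modular byteIdx computation is also why mallocinit above now insists that heapArenaBitmapBytes be a power of 2.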
@@ -446,9 +439,24 @@ func findObject(p, refBase, refOff uintptr) (base uintptr, s *mspan, objIndex ui
 //go:nosplit
 func (h heapBits) next() heapBits {
 	if h.shift < 3*heapBitsShift {
-		return heapBits{h.bitp, h.shift + heapBitsShift}
+		h.shift += heapBitsShift
+	} else if h.bitp != h.last {
+		h.bitp, h.shift = add1(h.bitp), 0
+	} else {
+		// Move to the next arena.
+		h.arena++
+		a := mheap_.arenas[h.arena]
+		if a == nil {
+			// We just passed the end of the object, which
+			// was also the end of the heap. Poison h. It
+			// should never be dereferenced at this point.
+			h.bitp, h.last = nil, nil
+		} else {
+			h.bitp, h.shift = &a.bitmap[0], 0
+			h.last = &a.bitmap[len(a.bitmap)-1]
+		}
 	}
-	return heapBits{add1(h.bitp), 0}
+	return h
 }
 
 // forward returns the heapBits describing n pointer-sized words ahead of h in memory.
@@ -456,16 +464,37 @@ func (h heapBits) next() heapBits {
 // h.forward(1) is equivalent to h.next(), just slower.
 // Note that forward does not modify h. The caller must record the result.
 // bits returns the heap bits for the current word.
+//go:nosplit
 func (h heapBits) forward(n uintptr) heapBits {
 	n += uintptr(h.shift) / heapBitsShift
-	return heapBits{addb(h.bitp, n/4), uint32(n%4) * heapBitsShift}
+	nbitp := uintptr(unsafe.Pointer(h.bitp)) + n/4
+	h.shift = uint32(n%4) * heapBitsShift
+	if nbitp <= uintptr(unsafe.Pointer(h.last)) {
+		h.bitp = (*uint8)(unsafe.Pointer(nbitp))
+		return h
+	}
+
+	// We're in a new heap arena.
+	past := nbitp - (uintptr(unsafe.Pointer(h.last)) + 1)
+	h.arena += 1 + uint32(past/heapArenaBitmapBytes)
+	a := mheap_.arenas[h.arena]
+	if a == nil {
+		h.bitp, h.last = nil, nil
+	} else {
+		h.bitp = &a.bitmap[past%heapArenaBitmapBytes]
+		h.last = &a.bitmap[len(a.bitmap)-1]
+	}
+	return h
 }
 
 // forwardOrBoundary is like forward, but stops at boundaries between
 // contiguous sections of the bitmap. It returns the number of words
 // advanced over, which will be <= n.
 func (h heapBits) forwardOrBoundary(n uintptr) (heapBits, uintptr) {
-	// The bitmap is contiguous right now, so this is just forward.
+	maxn := 4 * ((uintptr(unsafe.Pointer(h.last)) + 1) - uintptr(unsafe.Pointer(h.bitp)))
+	if n > maxn {
+		n = maxn
+	}
 	return h.forward(n), n
 }
 
@@ -951,6 +980,16 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 	// This is a lot of lines of code, but it compiles into relatively few
 	// machine instructions.
 
+	outOfPlace := false
+	if (x+size-1)/heapArenaBytes != uintptr(h.arena) {
+		// This object spans heap arenas, so the bitmap may be
+		// discontiguous. Unroll it into the object instead
+		// and then copy it out.
+		outOfPlace = true
+		h.bitp = (*uint8)(unsafe.Pointer(x))
+		h.last = nil
+	}
+
 	var (
 		// Ptrmask input.
 		p *byte // last ptrmask byte read
@@ -989,9 +1028,8 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 			}
 			ptrmask = debugPtrmask.data
 			runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1)
-			goto Phase4
 		}
-		return
+		goto Phase4
 	}
 
 	// Note about sizes:
@@ -1109,7 +1147,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 		nw = 2
 	}
 
-	// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==4).
+	// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==2).
 	// The leading byte is special because it contains the bits for word 1,
 	// which does not have the scan bit set.
 	// The leading half-byte is special because it's a half a byte,
@@ -1280,9 +1318,81 @@ Phase3:
 	}
 
 Phase4:
-	// Phase 4: all done, but perhaps double check.
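Before the Phase 4 rewrite that follows, a note on the traversal changes above. The subtle part of the new forward is the arena-crossing arithmetic: once the advanced bitmap pointer runs past last, the overshoot determines both how many whole arenas were skipped and the landing offset within the final arena's bitmap. A self-contained sketch of just that index math, using abstract byte offsets in place of real bitmap pointers (advance is hypothetical, not runtime code):

```go
package main

import "fmt"

const heapArenaBitmapBytes = 2 << 20 // the 64-bit value: 2 MB per arena

// advance mirrors forward's arithmetic on an abstract position
// (arena index, byte offset into that arena's bitmap, shift 0..3),
// moving n pointer-sized words ahead.
func advance(arena, byteOff, shift, n uintptr) (uintptr, uintptr, uintptr) {
	n += shift // work in units of 2-bit entries
	newByte := byteOff + n/4
	newShift := n % 4
	if newByte < heapArenaBitmapBytes {
		return arena, newByte, newShift // same arena: the fast path
	}
	// The overshoot past this arena's last bitmap byte tells us how
	// many whole arenas were crossed and where we land in the final one.
	past := newByte - heapArenaBitmapBytes
	arena += 1 + past/heapArenaBitmapBytes
	return arena, past % heapArenaBitmapBytes, newShift
}

func main() {
	// From the last entry of arena 5's bitmap, two words ahead lands
	// at byte 0, shift 1 of arena 6.
	fmt.Println(advance(5, heapArenaBitmapBytes-1, 3, 2))
}
```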
+	// Phase 4: Copy unrolled bitmap to per-arena bitmaps, if necessary.
+	if outOfPlace {
+		// TODO: We could probably make this faster by
+		// handling [x+dataSize, x+size) specially.
+		h := heapBitsForAddr(x)
+		// cnw is the number of heap words, or bit pairs
+		// remaining (like nw above).
+		cnw := size / sys.PtrSize
+		src := (*uint8)(unsafe.Pointer(x))
+		// We know the first and last byte of the bitmap are
+		// not the same, but it's still possible for small
+		// objects to span arenas, so the object may share
+		// bitmap bytes with neighboring objects.
+		//
+		// Handle the first byte specially if it's shared. See
+		// Phase 1 for why this is the only special case we need.
+		if doubleCheck {
+			if !(h.shift == 0 || (sys.PtrSize == 8 && h.shift == 2)) {
+				print("x=", x, " size=", size, " shift=", h.shift, "\n")
+				throw("bad start shift")
+			}
+		}
+		if sys.PtrSize == 8 && h.shift == 2 {
+			*h.bitp = *h.bitp&^((bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift)<<(2*heapBitsShift)) | *src
+			h = h.next().next()
+			cnw -= 2
+			src = addb(src, 1)
+		}
+		// We're now byte aligned. Copy out to per-arena
+		// bitmaps until the last byte (which may again be
+		// partial).
+		for cnw >= 4 {
+			hNext, words := h.forwardOrBoundary(cnw)
+
+			// n is the number of bitmap bytes to copy.
+			n := words / 4
+			memmove(unsafe.Pointer(h.bitp), unsafe.Pointer(src), n)
+			cnw -= words
+			h = hNext
+			src = addb(src, n)
+		}
+		// Handle the last byte if it's shared.
+		if cnw == 2 {
+			*h.bitp = *h.bitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | *src
+			src = addb(src, 1)
+			h = h.next().next()
+		}
+		if doubleCheck {
+			if uintptr(unsafe.Pointer(src)) > x+size {
+				throw("copy exceeded object size")
+			}
+			if !(cnw == 0 || cnw == 2) {
+				print("x=", x, " size=", size, " cnw=", cnw, "\n")
+				throw("bad number of remaining words")
+			}
+			// Set up hbitp so doubleCheck code below can check it.
+			hbitp = h.bitp
+		}
+		// Zero the object where we wrote the bitmap.
+		memclrNoHeapPointers(unsafe.Pointer(x), uintptr(unsafe.Pointer(src))-x)
+	}
+
+	// Double check the whole bitmap.
 	if doubleCheck {
-		end := heapBitsForAddr(x + size)
+		// x+size may not point to the heap, so back up one
+		// word and then call next().
+		end := heapBitsForAddr(x + size - sys.PtrSize).next()
+		if !outOfPlace && (end.bitp == nil || (end.shift == 0 && end.bitp == &mheap_.arenas[end.arena].bitmap[0])) {
+			// The unrolling code above walks hbitp just
+			// past the bitmap without moving to the next
+			// arena. Synthesize this for end.bitp.
+			end.bitp = addb(&mheap_.arenas[end.arena-1].bitmap[0], heapArenaBitmapBytes)
+			end.arena--
+			end.last = nil
+		}
 		if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
 			println("ended at wrong bitmap byte for", typ.string(), "x", dataSize/typ.size)
 			print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
@@ -1322,7 +1432,7 @@ Phase4:
 			if have != want {
 				println("mismatch writing bits for", typ.string(), "x", dataSize/typ.size)
 				print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
-				print("kindGCProg=", typ.kind&kindGCProg != 0, "\n")
+				print("kindGCProg=", typ.kind&kindGCProg != 0, " outOfPlace=", outOfPlace, "\n")
 				print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
 				h0 := heapBitsForAddr(x)
 				print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
@@ -1430,7 +1540,7 @@ func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize u
 		totalBits = (elemSize*(count-1) + progSize) / sys.PtrSize
 	}
 	endProg := unsafe.Pointer(addb(h.bitp, (totalBits+3)/4))
-	endAlloc := unsafe.Pointer(addb(h.bitp, allocSize/heapBitmapScale))
+	endAlloc := unsafe.Pointer(addb(h.bitp, allocSize/sys.PtrSize/wordsPerBitmapByte))
 	memclrNoHeapPointers(endProg, uintptr(endAlloc)-uintptr(endProg))
 }
 
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 737161dfeee490b3053eda2a51732329732fe3f2..eb9418f0dbf1d732ae27290b61c156dd2ce494ec 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -114,9 +114,6 @@ type mheap struct {
 	nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
 
 	// range of addresses we might see in the heap
-	bitmap_start  uintptr // Points to first byte of bitmap
-	bitmap_mapped uintptr
-	bitmap_delta  uintptr // Used to map heap address to bitmap address
 
 	// The arena_* fields indicate the addresses of the Go heap.
 	//
@@ -143,6 +140,21 @@ type mheap struct {
 	// here and *must* clobber it to use it.
 	arena_reserved bool
 
+	// arenas is the heap arena index. arenas[va/heapArenaBytes]
+	// points to the metadata for the heap arena containing va.
+	//
+	// For regions of the address space that are not backed by the
+	// Go heap, the arena index contains nil.
+	//
+	// Modifications are protected by mheap_.lock. Reads can be
+	// performed without locking; however, a given entry can
+	// transition from nil to non-nil at any time when the lock
+	// isn't held. (Entries never transition back to nil.)
+	//
+	// This structure is fully mapped by mallocinit, so it's safe
+	// to probe any index.
+	arenas *[memLimit / heapArenaBytes]*heapArena
+
 	//_ uint32 // ensure 64-bit alignment
 
 	// central free lists for small size classes.
@@ -167,6 +179,23 @@ type mheap struct {
 
 var mheap_ mheap
 
+// A heapArena stores metadata for a heap arena. heapArenas are stored
+// outside of the Go heap and accessed via the mheap_.arenas index.
+//
+// This gets allocated directly from the OS, so ideally it should be a
+// multiple of the system page size. For example, avoid adding small
+// fields.
+//
+//go:notinheap
+type heapArena struct {
+	// bitmap stores the pointer/scalar bitmap for the words in
+	// this arena. See mbitmap.go for a description. Use the
+	// heapBits type to access this.
+	bitmap [heapArenaBitmapBytes]byte
+
+	// TODO: Also store the spans map here.
+}
+
 // An MSpan is a run of pages.
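An aside on the Phase 4 copy in mbitmap.go above, before the mheap span code continues: the first and last bitmap bytes of an out-of-place object can also describe a neighbor's words, so they are merged with a masked read-modify-write instead of being memmoved. A minimal sketch of the merge for the cnw == 2 (shared last byte) case; note the patch itself can OR in *src without masking because the unused half of the unrolled byte is known to be zero:

```go
package main

import "fmt"

const (
	bitPointer    = 1 << 0
	bitScan       = 1 << 4
	heapBitsShift = 1
)

// mergeLow installs the two low 2-bit entries from src into dst while
// preserving dst's two high entries, which belong to a neighboring
// object. The first-byte case uses the complementary (high) mask.
func mergeLow(dst, src uint8) uint8 {
	const mask = bitPointer | bitScan | (bitPointer|bitScan)<<heapBitsShift // 0x33
	return dst&^mask | src&mask
}

func main() {
	dst := uint8(0xf0) // neighbor's entries live in the high positions
	src := uint8(0x33) // freshly unrolled entries for our two words
	fmt.Printf("%#x\n", mergeLow(dst, src)) // 0xf3
}
```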
 //
 // When a MSpan is in the heap free list, state == MSpanFree
@@ -507,8 +536,21 @@ func (h *mheap) setArenaUsed(arena_used uintptr, racemap bool) {
 	// avoids faults when other threads try access these regions immediately
 	// after observing the change to arena_used.
 
-	// Map the bitmap.
-	h.mapBits(arena_used)
+	// Allocate heap arena metadata.
+	for ri := h.arena_used / heapArenaBytes; ri < (arena_used+heapArenaBytes-1)/heapArenaBytes; ri++ {
+		if h.arenas[ri] != nil {
+			continue
+		}
+		r := (*heapArena)(persistentalloc(unsafe.Sizeof(heapArena{}), sys.PtrSize, &memstats.gc_sys))
+		if r == nil {
+			throw("runtime: out of memory allocating heap arena metadata")
+		}
+		// Store atomically just in case an object from the
+		// new heap arena becomes visible before the heap lock
+		// is released (which shouldn't happen, but there's
+		// little downside to this).
+		atomic.StorepNoWB(unsafe.Pointer(&h.arenas[ri]), unsafe.Pointer(r))
+	}
 
 	// Map spans array.
 	h.mapSpans(arena_used)
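The atomic.StorepNoWB in setArenaUsed is the standard publish idiom: initialize the heapArena completely, then make it visible with a single atomic pointer store, so lock-free readers such as heapBitsForAddr observe either nil or a fully built entry, never a partial one. Outside the runtime, the same pattern looks roughly like this sketch (the index type and helpers are illustrative, not runtime API):

```go
package main

import (
	"fmt"
	"sync/atomic"
	"unsafe"
)

type arenaMeta struct {
	bitmap [1 << 10]byte // stand-in for the real per-arena bitmap
}

// index plays the role of mheap_.arenas: entries start nil and each is
// published at most once, under a lock that is elided here.
var index [16]unsafe.Pointer

// publish assumes m is fully initialized before this call; the atomic
// store then makes it visible to concurrent readers all at once.
func publish(ri int, m *arenaMeta) {
	atomic.StorePointer(&index[ri], unsafe.Pointer(m))
}

// lookup is the lock-free read side, like heapBitsForAddr's probe of
// mheap_.arenas: nil means the address is not heap-backed.
func lookup(ri int) *arenaMeta {
	return (*arenaMeta)(atomic.LoadPointer(&index[ri]))
}

func main() {
	publish(3, new(arenaMeta))
	fmt.Println(lookup(3) != nil, lookup(4) != nil) // true false
}
```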