diff --git a/src/cmd/internal/obj/wasm/a.out.go b/src/cmd/internal/obj/wasm/a.out.go
index c686f1d6f0ec086a4a78ea24f2f509215128cf07..823777d4fbca25fec8beb49bb7adbc6802dc3466 100644
--- a/src/cmd/internal/obj/wasm/a.out.go
+++ b/src/cmd/internal/obj/wasm/a.out.go
@@ -250,9 +250,7 @@ const (
 
 const (
 	// globals
-	REG_PC_F = obj.RBaseWasm + iota
-	REG_PC_B
-	REG_SP // SP is currently 32-bit, until 64-bit memory operations are available
+	REG_SP = obj.RBaseWasm + iota // SP is currently 32-bit, until 64-bit memory operations are available
 	REG_CTXT
 	REG_g
 	// RET* are used by runtime.return0 and runtime.reflectcall. These functions pass return values in registers.
@@ -296,9 +294,11 @@ const (
 	REG_F14
 	REG_F15
 
+	REG_PC_B // also first parameter, i32
+
 	MAXREG
 
-	MINREG  = REG_PC_F
+	MINREG  = REG_SP
 	REGSP   = REG_SP
 	REGCTXT = REG_CTXT
 	REGG    = REG_g
diff --git a/src/cmd/internal/obj/wasm/wasmobj.go b/src/cmd/internal/obj/wasm/wasmobj.go
index 0ad883470ef253760de376b2a952240b053f7efd..a6388b9ee7e3cbc05240200efda1aaa1012f8f40 100644
--- a/src/cmd/internal/obj/wasm/wasmobj.go
+++ b/src/cmd/internal/obj/wasm/wasmobj.go
@@ -16,8 +16,6 @@ import (
 )
 
 var Register = map[string]int16{
-	"PC_F":  REG_PC_F,
-	"PC_B":  REG_PC_B,
 	"SP":    REG_SP,
 	"CTXT":  REG_CTXT,
 	"g":     REG_g,
@@ -60,6 +58,8 @@ var Register = map[string]int16{
 	"F13": REG_F13,
 	"F14": REG_F14,
 	"F15": REG_F15,
+
+	"PC_B": REG_PC_B,
 }
 
 var registerNames []string
@@ -368,20 +368,31 @@ func preprocess(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
 				break
 			}
 
-			// reset PC_B to function entry
-			p = appendp(p, AI32Const, constAddr(0))
-			p = appendp(p, ASet, regAddr(REG_PC_B))
-
 			// low-level WebAssembly call to function
 			switch jmp.To.Type {
 			case obj.TYPE_MEM:
+				if !notUsePC_B[jmp.To.Sym.Name] {
+					// Set PC_B parameter to function entry.
+					p = appendp(p, AI32Const, constAddr(0))
+				}
 				p = appendp(p, ACall, jmp.To)
+
 			case obj.TYPE_NONE:
 				// (target PC is on stack)
 				p = appendp(p, AI32WrapI64)
 				p = appendp(p, AI32Const, constAddr(16)) // only needs PC_F bits (16-31), PC_B bits (0-15) are zero
 				p = appendp(p, AI32ShrU)
+
+				// Set PC_B parameter to function entry.
+				// We need to push this before pushing the target PC_F,
+				// so temporarily pop PC_F, using our REG_PC_B as a
+				// scratch register, and push it back after pushing 0.
+				p = appendp(p, ASet, regAddr(REG_PC_B))
+				p = appendp(p, AI32Const, constAddr(0))
+				p = appendp(p, AGet, regAddr(REG_PC_B))
+
 				p = appendp(p, ACallIndirect)
+
 			default:
 				panic("bad target for JMP")
 			}
@@ -419,20 +430,31 @@ func preprocess(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
 			})
 			p = appendp(p, AI64Store, constAddr(0))
 
-			// reset PC_B to function entry
-			p = appendp(p, AI32Const, constAddr(0))
-			p = appendp(p, ASet, regAddr(REG_PC_B))
-
 			// low-level WebAssembly call to function
 			switch call.To.Type {
 			case obj.TYPE_MEM:
+				if !notUsePC_B[call.To.Sym.Name] {
+					// Set PC_B parameter to function entry.
+					p = appendp(p, AI32Const, constAddr(0))
+				}
 				p = appendp(p, ACall, call.To)
+
 			case obj.TYPE_NONE:
 				// (target PC is on stack)
 				p = appendp(p, AI32WrapI64)
 				p = appendp(p, AI32Const, constAddr(16)) // only needs PC_F bits (16-31), PC_B bits (0-15) are zero
 				p = appendp(p, AI32ShrU)
+
+				// Set PC_B parameter to function entry.
+				// We need to push this before pushing the target PC_F,
+				// so temporarily pop PC_F, using our PC_B as a
+				// scratch register, and push it back after pushing 0.
+				p = appendp(p, ASet, regAddr(REG_PC_B))
+				p = appendp(p, AI32Const, constAddr(0))
+				p = appendp(p, AGet, regAddr(REG_PC_B))
+
 				p = appendp(p, ACallIndirect)
+
 			default:
 				panic("bad target for CALL")
 			}
@@ -465,7 +487,13 @@ func preprocess(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
 
 			// jump to before the call if jmpdefer has reset the return address to the call's PC
 			if call.To.Sym == deferreturn {
-				p = appendp(p, AGet, regAddr(REG_PC_B))
+				// get PC_B from -8(SP)
+				p = appendp(p, AGet, regAddr(REG_SP))
+				p = appendp(p, AI32Const, constAddr(8))
+				p = appendp(p, AI32Sub)
+				p = appendp(p, AI32Load16U, constAddr(0))
+				p = appendp(p, ATee, regAddr(REG_PC_B))
+
 				p = appendp(p, AI32Const, constAddr(call.Pc))
 				p = appendp(p, AI32Eq)
 				p = appendp(p, ABrIf, constAddr(0))
@@ -487,9 +515,8 @@ func preprocess(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
 			}
 
 			if ret.To.Type == obj.TYPE_MEM {
-				// reset PC_B to function entry
+				// Set PC_B parameter to function entry.
 				p = appendp(p, AI32Const, constAddr(0))
-				p = appendp(p, ASet, regAddr(REG_PC_B))
 
 				// low-level WebAssembly call to function
 				p = appendp(p, ACall, ret.To)
@@ -497,16 +524,6 @@ func preprocess(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
 				break
 			}
 
-			// read return PC_F from Go stack
-			p = appendp(p, AGet, regAddr(REG_SP))
-			p = appendp(p, AI32Load16U, constAddr(2))
-			p = appendp(p, ASet, regAddr(REG_PC_F))
-
-			// read return PC_B from Go stack
-			p = appendp(p, AGet, regAddr(REG_SP))
-			p = appendp(p, AI32Load16U, constAddr(0))
-			p = appendp(p, ASet, regAddr(REG_PC_B))
-
 			// SP += 8
 			p = appendp(p, AGet, regAddr(REG_SP))
 			p = appendp(p, AI32Const, constAddr(8))
@@ -771,16 +788,38 @@ func countRegisters(s *obj.LSym) (numI, numF int16) {
 	return
 }
 
+// Most Go functions take a single parameter (PC_B) in the Wasm ABI.
+// notUsePC_B lists the exceptions, for which no PC_B argument is pushed.
+var notUsePC_B = map[string]bool{
+	"_rt0_wasm_js":           true,
+	"wasm_export_run":        true,
+	"wasm_export_resume":     true,
+	"wasm_export_getsp":      true,
+	"wasm_pc_f_loop":         true,
+	"runtime.wasmMove":       true,
+	"runtime.wasmZero":       true,
+	"runtime.wasmDiv":        true,
+	"runtime.wasmTruncS":     true,
+	"runtime.wasmTruncU":     true,
+	"runtime.gcWriteBarrier": true,
+	"cmpbody":                true,
+	"memeqbody":              true,
+	"memcmp":                 true,
+	"memchr":                 true,
+}
+
 func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
 	w := new(bytes.Buffer)
 
 	hasLocalSP := false
+	hasPC_B := false
 	var r0, f0 int16
 
 	// Function starts with declaration of locals: numbers and types.
 	// Some functions use a special calling convention.
 	switch s.Name {
-	case "wasm_export_run", "runtime.wasmMove", "runtime.wasmZero", "runtime.wasmDiv", "runtime.wasmTruncS", "runtime.wasmTruncU", "memeqbody":
+	case "_rt0_wasm_js", "wasm_export_run", "wasm_export_resume", "wasm_export_getsp", "wasm_pc_f_loop",
+		"runtime.wasmMove", "runtime.wasmZero", "runtime.wasmDiv", "runtime.wasmTruncS", "runtime.wasmTruncU", "memeqbody":
 		writeUleb128(w, 0) // number of sets of locals
 	case "memchr", "memcmp":
 		writeUleb128(w, 1) // number of sets of locals
@@ -797,9 +836,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
 	default:
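-		// Normal calling convention: No WebAssembly parameters. First local variable is local SP cache.
+		// Normal calling convention: PC_B as WebAssembly parameter. First local variable is local SP cache.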
 		hasLocalSP = true
+		hasPC_B = true
 		numI, numF := countRegisters(s)
-		r0 = 1
-		f0 = 1 + numI
+		r0 = 2
+		f0 = 2 + numI
 
 		numTypes := 1
 		if numI > 0 {
@@ -826,6 +866,7 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
 		// Copy SP from its global variable into a local variable. Accessing a local variable is more efficient.
 		updateLocalSP(w)
 	}
+
 	for p := s.Func.Text; p != nil; p = p.Link {
 		switch p.As {
 		case AGet:
@@ -836,10 +877,16 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
 			switch {
 			case reg == REG_SP && hasLocalSP:
 				w.WriteByte(0x20)  // local.get
-				writeUleb128(w, 0) // local SP
-			case reg >= REG_PC_F && reg <= REG_PAUSE:
+				writeUleb128(w, 1) // local SP
+			case reg >= REG_SP && reg <= REG_PAUSE:
 				w.WriteByte(0x23) // global.get
-				writeUleb128(w, uint64(reg-REG_PC_F))
+				writeUleb128(w, uint64(reg-REG_SP))
+			case reg == REG_PC_B:
+				if !hasPC_B {
+					panic(fmt.Sprintf("PC_B is not used in %s", s.Name))
+				}
+				w.WriteByte(0x20)  // local.get (i32)
+				writeUleb128(w, 0) // local PC_B
 			case reg >= REG_R0 && reg <= REG_R15:
 				w.WriteByte(0x20) // local.get (i64)
 				writeUleb128(w, uint64(r0+(reg-REG_R0)))
@@ -857,21 +904,26 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
 			}
 			reg := p.To.Reg
 			switch {
-			case reg >= REG_PC_F && reg <= REG_PAUSE:
+			case reg >= REG_SP && reg <= REG_PAUSE:
 				if reg == REG_SP && hasLocalSP {
 					w.WriteByte(0x22)  // local.tee
-					writeUleb128(w, 0) // local SP
+					writeUleb128(w, 1) // local SP
 				}
 				w.WriteByte(0x24) // global.set
-				writeUleb128(w, uint64(reg-REG_PC_F))
-			case reg >= REG_R0 && reg <= REG_F15:
+				writeUleb128(w, uint64(reg-REG_SP))
+			case reg >= REG_R0 && reg <= REG_PC_B:
 				if p.Link.As == AGet && p.Link.From.Reg == reg {
 					w.WriteByte(0x22) // local.tee
 					p = p.Link
 				} else {
 					w.WriteByte(0x21) // local.set
 				}
-				if reg <= REG_R15 {
+				if reg == REG_PC_B {
+					if !hasPC_B {
+						panic(fmt.Sprintf("PC_B is not used in %s", s.Name))
+					}
+					writeUleb128(w, 0) // local PC_B
+				} else if reg <= REG_R15 {
 					writeUleb128(w, uint64(r0+(reg-REG_R0)))
 				} else {
 					writeUleb128(w, uint64(f0+(reg-REG_F0)))
@@ -887,6 +939,12 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
 			}
 			reg := p.To.Reg
 			switch {
+			case reg == REG_PC_B:
+				if !hasPC_B {
+					panic(fmt.Sprintf("PC_B is not used in %s", s.Name))
+				}
+				w.WriteByte(0x22)  // local.tee (i32)
+				writeUleb128(w, 0) // local PC_B
 			case reg >= REG_R0 && reg <= REG_R15:
 				w.WriteByte(0x22) // local.tee (i64)
 				writeUleb128(w, uint64(r0+(reg-REG_R0)))
@@ -1036,10 +1094,10 @@ func assemble(ctxt *obj.Link, s *obj.LSym, newprog obj.ProgAlloc) {
 }
 
 func updateLocalSP(w *bytes.Buffer) {
-	w.WriteByte(0x23)                        // global.get
-	writeUleb128(w, uint64(REG_SP-REG_PC_F)) // SP
-	w.WriteByte(0x21)                        // local.set
-	writeUleb128(w, 0)                       // local SP
+	w.WriteByte(0x23)  // global.get
+	writeUleb128(w, 0) // global SP
+	w.WriteByte(0x21)  // local.set
+	writeUleb128(w, 1) // local SP
 }
 
 func align(as obj.As) uint64 {
diff --git a/src/cmd/link/internal/wasm/asm.go b/src/cmd/link/internal/wasm/asm.go
index c80e81e5b355fbf8b7d47686d9ef2e92b46daa8e..54b265cb19fdfa5118073d2d860755593c314293 100644
--- a/src/cmd/link/internal/wasm/asm.go
+++ b/src/cmd/link/internal/wasm/asm.go
@@ -102,10 +102,11 @@ func asmb2(ctxt *ld.Link) {
 	}
 
 	types := []*wasmFuncType{
-		// For normal Go functions the return value is
+		// For normal Go functions, the single parameter is PC_B,
+		// the return value is
 		// 0 if the function returned normally or
 		// 1 if the stack needs to be unwound.
-		{Results: []byte{I32}},
+		{Params: []byte{I32}, Results: []byte{I32}},
 	}
 
 	// collect host imports (functions that get imported from the WebAssembly host, usually JavaScript)
@@ -320,16 +321,14 @@ func writeGlobalSec(ctxt *ld.Link) {
 	sizeOffset := writeSecHeader(ctxt, sectionGlobal)
 
 	globalRegs := []byte{
-		I32, // 0: PC_F
-		I32, // 1: PC_B
-		I32, // 2: SP
-		I64, // 3: CTXT
-		I64, // 4: g
-		I64, // 5: RET0
-		I64, // 6: RET1
-		I64, // 7: RET2
-		I64, // 8: RET3
-		I32, // 9: PAUSE
+		I32, // 0: SP
+		I64, // 1: CTXT
+		I64, // 2: g
+		I64, // 3: RET0
+		I64, // 4: RET1
+		I64, // 5: RET2
+		I64, // 6: RET3
+		I32, // 7: PAUSE
 	}
 
 	writeUleb128(ctxt.Out, uint64(len(globalRegs))) // number of globals
diff --git a/src/runtime/asm_wasm.s b/src/runtime/asm_wasm.s
index a10c89d298bb27d783b63a199bfcd574d2afb42b..8f3964f08b4266ea48c449e6f53b6c842c8b57d2 100644
--- a/src/runtime/asm_wasm.s
+++ b/src/runtime/asm_wasm.s
@@ -37,17 +37,12 @@ TEXT runtime·gogo(SB), NOSPLIT, $0-8
 	MOVD gobuf_g(R0), g
 	MOVD gobuf_sp(R0), SP
 
+	// Put target PC at -8(SP), wasm_pc_f_loop will pick it up
+	Get SP
+	I32Const $8
+	I32Sub
 	I64Load gobuf_pc(R0)
-	I32WrapI64
-	I32Const $16
-	I32ShrU
-	Set PC_F
-
-	I64Load gobuf_pc(R0)
-	I64Const $0xFFFF
-	I64And
-	I32WrapI64
-	Set PC_B
+	I64Store $0
 
 	MOVD gobuf_ret(R0), RET0
 	MOVD gobuf_ctxt(R0), CTXT
diff --git a/src/runtime/rt0_js_wasm.s b/src/runtime/rt0_js_wasm.s
index c4efd9637c41cd45fa3474e616c82567951d1323..b22c46e2e95a9579f202f5111bb63898cbd67164 100644
--- a/src/runtime/rt0_js_wasm.s
+++ b/src/runtime/rt0_js_wasm.s
@@ -31,14 +31,9 @@ TEXT wasm_export_run(SB),NOSPLIT,$0
 	I64ExtendI32U
 	I64Store $8
 
-	I32Const $runtime·rt0_go(SB)
-	I32Const $16
-	I32ShrU
-	Set PC_F
-
-	I32Const $0
-	Set PC_B
-
+	I32Const $0 // entry PC_B
+	Call runtime·rt0_go(SB)
+	Drop
 	Call wasm_pc_f_loop(SB)
 
 	Return
@@ -46,14 +41,9 @@ TEXT wasm_export_run(SB),NOSPLIT,$0
 // wasm_export_resume gets called from JavaScript. It resumes the execution of Go code until it needs to wait for
 // an event.
 TEXT wasm_export_resume(SB),NOSPLIT,$0
-	I32Const $runtime·handleEvent(SB)
-	I32Const $16
-	I32ShrU
-	Set PC_F
-
 	I32Const $0
-	Set PC_B
-
+	Call runtime·handleEvent(SB)
+	Drop
 	Call wasm_pc_f_loop(SB)
 
 	Return
@@ -63,15 +53,30 @@ TEXT wasm_pc_f_loop(SB),NOSPLIT,$0
 // The WebAssembly stack may unwind, e.g. when switching goroutines.
 // The Go stack on the linear memory is then used to jump to the correct functions
 // with this loop, without having to restore the full WebAssembly stack.
-loop:
-	Loop
-		Get PC_F
-		CallIndirect $0
-		Drop
-
-		Get PAUSE
-		I32Eqz
-		BrIf loop
+// It is expected to have a pending call before entering the loop, so check PAUSE first.
+	Get PAUSE
+	I32Eqz
+	If
+	loop:
+		Loop
+			// Get PC_B & PC_F from -8(SP)
+			Get SP
+			I32Const $8
+			I32Sub
+			I32Load16U $0 // PC_B
+
+			Get SP
+			I32Const $8
+			I32Sub
+			I32Load16U $2 // PC_F
+
+			CallIndirect $0
+			Drop
+
+			Get PAUSE
+			I32Eqz
+			BrIf loop
+		End
 	End
 
 	I32Const $0
@@ -91,6 +96,7 @@ TEXT runtime·pause(SB), NOSPLIT, $0-8
 	RETUNWIND
 
 TEXT runtime·exit(SB), NOSPLIT, $0-4
+	I32Const $0
 	Call runtime·wasmExit(SB)
 	Drop
 	I32Const $1