Skip to content

Conversation

@rudyfly
Copy link
Collaborator

@rudyfly rudyfly commented May 9, 2018

Ⅰ. Describe what this PR did

Add lock in boltdb, in order to avoid pouchd panic when put/get boltdb
objects concurrently.

Ⅱ. Does this pull request fix one issue?

use shell script will cause pouchd panic

#!/bin/bash

op=$1

for i in `seq 1 10`
do
	case $op in
		create)
			pouch run -d --name multi-create-$i -v /mnt/dir1 -v /mnt/dir2 -v /mnt/dir3 -v /mnt/dir4 -v /mnt/dir5 -v /mnt/dir6 -v /mnt/dir7 -v /mnt/dir8 -v /mnt/dir9 -v /mnt/dir10 registry.hub.docker.com/library/busybox:latest top &
			;;
		remove)
			pouch rm -vf multi-create-$i
			;;
		*)
			;;
	esac
done
# test.sh create

pouchd panic

unexpected fault address 0x7fc8fd450853
fatal error: fault
[signal SIGSEGV: segmentation violation code=0x1 addr=0x7fc8fd450853 pc=0xbf3adc]

goroutine 570 [running]:
runtime.throw(0x1b6bfeb, 0x5)
	/usr/local/go/src/runtime/panic.go:605 +0x95 fp=0xc4210d8548 sp=0xc4210d8528 pc=0xbc1275
runtime.sigpanic()
	/usr/local/go/src/runtime/signal_unix.go:374 +0x227 fp=0xc4210d8598 sp=0xc4210d8548 pc=0xbd8dc7
runtime.memmove(0xc420f23980, 0x7fc8fd450853, 0x40)
	/usr/local/go/src/runtime/memmove_amd64.s:184 +0x15c fp=0xc4210d85a0 sp=0xc4210d8598 pc=0xbf3adc
runtime.slicebytetostring(0x0, 0x7fc8fd450853, 0x40, 0x367, 0xc420f23980, 0x40)
	/usr/local/go/src/runtime/string.go:98 +0x79 fp=0xc4210d85d0 sp=0xc4210d85a0 pc=0xbdd829
encoding/json.(*decodeState).literalStore(0xc420f52ea0, 0x7fc8fd450852, 0x42, 0x368, 0x19270a0, 0xc420e8a240, 0x198, 0x185e900)
	/usr/local/go/src/encoding/json/decode.go:939 +0x1d16 fp=0xc4210d8940 sp=0xc4210d85d0 pc=0xc9ee36
encoding/json.(*decodeState).literal(0xc420f52ea0, 0x19270a0, 0xc420e8a240, 0x198)
	/usr/local/go/src/encoding/json/decode.go:802 +0xdb fp=0xc4210d8998 sp=0xc4210d8940 pc=0xc9ce4b
encoding/json.(*decodeState).value(0xc420f52ea0, 0x19270a0, 0xc420e8a240, 0x198)
	/usr/local/go/src/encoding/json/decode.go:408 +0x31e fp=0xc4210d8a18 sp=0xc4210d8998 pc=0xc99e8e
encoding/json.(*decodeState).object(0xc420f52ea0, 0x1b32c00, 0xc420e8a240, 0x16)
	/usr/local/go/src/encoding/json/decode.go:736 +0x1284 fp=0xc4210d8c68 sp=0xc4210d8a18 pc=0xc9c404
encoding/json.(*decodeState).value(0xc420f52ea0, 0x1b32c00, 0xc420e8a240, 0x16)
	/usr/local/go/src/encoding/json/decode.go:405 +0x2e4 fp=0xc4210d8ce8 sp=0xc4210d8c68 pc=0xc99e54
encoding/json.(*decodeState).unmarshal(0xc420f52ea0, 0x1b32c00, 0xc420e8a240, 0x0, 0x0)
	/usr/local/go/src/encoding/json/decode.go:187 +0x20e fp=0xc4210d8d60 sp=0xc4210d8ce8 pc=0xc992de
encoding/json.Unmarshal(0x7fc8fd45084a, 0x370, 0x370, 0x1b32c00, 0xc420e8a240, 0x7fc8fd45084a, 0x370)
	/usr/local/go/src/encoding/json/decode.go:107 +0x148 fp=0xc4210d8da8 sp=0xc4210d8d60 pc=0xc98ca8
github.com/alibaba/pouch/pkg/meta.(*Store).Get(0xc420419f20, 0xc420e49bc0, 0x40, 0xc420e3be00, 0x1, 0x0, 0xc4210d8eb0)
	/tmp/pouchbuild/src/github.com/alibaba/pouch/pkg/meta/store.go:166 +0x10b fp=0xc4210d8e40 sp=0xc4210d8da8 pc=0x12cbb7b
github.com/alibaba/pouch/storage/volume.(*Core).GetVolume(0xc4201184d0, 0xc420e49bc0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1a7daa0, 0x1927001, ...)
	/tmp/pouchbuild/src/github.com/alibaba/pouch/storage/volume/core.go:80 +0x9d fp=0xc4210d8f38 sp=0xc4210d8e40 pc=0x12f569d
github.com/alibaba/pouch/daemon/mgr.(*VolumeManager).Get(0xc4203100a0, 0x2506340, 0xc42044ff00, 0xc420e49bc0, 0x40, 0x0, 0xc420e50700, 0x7fc8fd5a2458)
	/tmp/pouchbuild/src/github.com/alibaba/pouch/daemon/mgr/volume.go:137 +0xb5 fp=0xc4210d8ff8 sp=0xc4210d8f38 pc=0x17aabf5
github.com/alibaba/pouch/daemon/mgr.(*VolumeManager).Attach(0xc4203100a0, 0x2506340, 0xc42044ff00, 0xc420e49bc0, 0x40, 0xc420e1c510, 0x0, 0xc420964f90, 0x0)
	/tmp/pouchbuild/src/github.com/alibaba/pouch/daemon/mgr/volume.go:161 +0xeb fp=0xc4210d9248 sp=0xc4210d8ff8 pc=0x17aaf6b
github.com/alibaba/pouch/daemon/mgr.(*ContainerManager).attachVolume(0xc42025eab0, 0x2506340, 0xc42044ff00, 0xc420e49bc0, 0x40, 0xc420ad9380, 0x0, 0x53, 0x1b6c220, 0x5, ...)
	/tmp/pouchbuild/src/github.com/alibaba/pouch/daemon/mgr/container.go:1798 +0x489 fp=0xc4210d93a0 sp=0xc4210d9248 pc=0x1773989
github.com/alibaba/pouch/daemon/mgr.(*ContainerManager).getMountPointFromBinds(0xc42025eab0, 0x2506340, 0xc42044ff00, 0xc420ad9380, 0xc420907ef0, 0x0, 0x0)
	/tmp/pouchbuild/src/github.com/alibaba/pouch/daemon/mgr/container.go:1915 +0xb54 fp=0xc4210d95d0 sp=0xc4210d93a0 pc=0x1774e04
github.com/alibaba/pouch/daemon/mgr.(*ContainerManager).generateMountPoints(0xc42025eab0, 0x2506340, 0xc42044ff00, 0xc420ad9380, 0x0, 0x0)
	/tmp/pouchbuild/src/github.com/alibaba/pouch/daemon/mgr/container.go:1844 +0x1ed fp=0xc4210d9638 sp=0xc4210d95d0 pc=0x1773fcd
github.com/alibaba/pouch/daemon/mgr.(*ContainerManager).Create(0xc42025eab0, 0x2506340, 0xc42044ff00, 0xc42042d0e3, 0xe, 0xc420ad8b60, 0x0, 0x0, 0x0)
	/tmp/pouchbuild/src/github.com/alibaba/pouch/daemon/mgr/container.go:454 +0xd6f fp=0xc4210d9928 sp=0xc4210d9638 pc=0x1766dbf
github.com/alibaba/pouch/apis/server.(*Server).createContainer(0xc4204504a0, 0x2506340, 0xc42044ff00, 0x2505140, 0xc4203ea000, 0xc4202f6600, 0x3, 0x3)
	/tmp/pouchbuild/src/github.com/alibaba/pouch/apis/server/container_bridge.go:172 +0x2e9 fp=0xc4210d9a20 sp=0xc4210d9928 pc=0x17f5be9
github.com/alibaba/pouch/apis/server.(*Server).(github.com/alibaba/pouch/apis/server.createContainer)-fm(0x2506340, 0xc42044ff00, 0x2505140, 0xc4203ea000, 0xc4202f6600, 0xc4202f6600, 0xc4210e1ad0)
	/tmp/pouchbuild/src/github.com/alibaba/pouch/apis/server/router.go:36 +0x5c fp=0xc4210d9a70 sp=0xc4210d9a20 pc=0x18021ac
github.com/alibaba/pouch/apis/server.filter.func1(0x2505140, 0xc4203ea000, 0xc4202f6600)
	/tmp/pouchbuild/src/github.com/alibaba/pouch/apis/server/router.go:149 +0x327 fp=0xc4210d9bf0 sp=0xc4210d9a70 pc=0x18017f7
net/http.HandlerFunc.ServeHTTP(0xc420998f90, 0x2505140, 0xc4203ea000, 0xc4202f6600)
	/usr/local/go/src/net/http/server.go:1918 +0x44 fp=0xc4210d9c18 sp=0xc4210d9bf0 pc=0xe37c84
github.com/alibaba/pouch/vendor/github.com/gorilla/mux.(*Router).ServeHTTP(0xc42096f4a0, 0x2505140, 0xc4203ea000, 0xc4202f6600)
	/tmp/pouchbuild/src/github.com/alibaba/pouch/vendor/github.com/gorilla/mux/mux.go:133 +0xed fp=0xc4210d9d18 sp=0xc4210d9c18 pc=0x173a44d
net/http.serverHandler.ServeHTTP(0xc420469110, 0x2505140, 0xc4203ea000, 0xc4202f6400)
	/usr/local/go/src/net/http/server.go:2619 +0xb4 fp=0xc4210d9d48 sp=0xc4210d9d18 pc=0xe3a964
net/http.(*conn).serve(0xc420229ea0, 0x2506340, 0xc42044fd40)
	/usr/local/go/src/net/http/server.go:1801 +0x71d fp=0xc4210d9fc8 sp=0xc4210d9d48 pc=0xe36b5d
runtime.goexit()
	/usr/local/go/src/runtime/asm_amd64.s:2337 +0x1 fp=0xc4210d9fd0 sp=0xc4210d9fc8 pc=0xbf31d1
created by net/http.(*Server).Serve
	/usr/local/go/src/net/http/server.go:2720 +0x288

Ⅲ. Describe how you did it

Add lock when put/get boltdb

Ⅳ. Describe how to verify it

use shell script to check pouchd panic or not.

Ⅴ. Special notes for reviews

Signed-off-by: Rudy Zhang [email protected]

Add lock in boltdb, in order to avoid pouchd panic when put/get boltdb
objects concurrently.

Signed-off-by: Rudy Zhang <[email protected]>
@pouchrobot pouchrobot added kind/bug This is bug report for project size/S labels May 9, 2018
@codecov-io
Copy link

Codecov Report

Merging #1286 into master will increase coverage by 0.31%.
The diff coverage is 100%.

Impacted file tree graph

@@            Coverage Diff             @@
##           master    #1286      +/-   ##
==========================================
+ Coverage   15.31%   15.63%   +0.31%     
==========================================
  Files         174      173       -1     
  Lines       11014    10870     -144     
==========================================
+ Hits         1687     1699      +12     
+ Misses       9208     9052     -156     
  Partials      119      119
Impacted Files Coverage Δ
pkg/meta/boltdb.go 66.26% <100%> (+5.7%) ⬆️
daemon/config/config.go 7.89% <0%> (ø) ⬆️
ctrd/client.go 0% <0%> (ø) ⬆️
daemon/mgr/system.go 0% <0%> (ø) ⬆️
ctrd/client_opts.go
pkg/utils/utils.go 77.03% <0%> (+7.23%) ⬆️


type bolt struct {
db *boltdb.DB
sync.Mutex
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am wondering if we should add this sync.Mutex, since I found that in the definition of bolt.DB, there are lock as well. Does it conflict? Please check for this.

type DB struct {
	// When enabled, the database will perform a Check() after every commit.
	// A panic is issued if the database is in an inconsistent state. This
	// flag has a large performance impact so it should only be used for
	// debugging purposes.
	StrictMode bool

	// Setting the NoSync flag will cause the database to skip fsync()
	// calls after each commit. This can be useful when bulk loading data
	// into a database and you can restart the bulk load in the event of
	// a system failure or database corruption. Do not set this flag for
	// normal use.
	//
	// If the package global IgnoreNoSync constant is true, this value is
	// ignored.  See the comment on that constant for more details.
	//
	// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
	NoSync bool

	// When true, skips the truncate call when growing the database.
	// Setting this to true is only safe on non-ext3/ext4 systems.
	// Skipping truncation avoids preallocation of hard drive space and
	// bypasses a truncate() and fsync() syscall on remapping.
	//
	// https://github.com/boltdb/bolt/issues/284
	NoGrowSync bool

	// If you want to read the entire database fast, you can set MmapFlag to
	// syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
	MmapFlags int

	// MaxBatchSize is the maximum size of a batch. Default value is
	// copied from DefaultMaxBatchSize in Open.
	//
	// If <=0, disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchSize int

	// MaxBatchDelay is the maximum delay before a batch starts.
	// Default value is copied from DefaultMaxBatchDelay in Open.
	//
	// If <=0, effectively disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchDelay time.Duration

	// AllocSize is the amount of space allocated when the database
	// needs to create new pages. This is done to amortize the cost
	// of truncate() and fsync() when growing the data file.
	AllocSize int

	path     string
	file     *os.File
	lockfile *os.File // windows only
	dataref  []byte   // mmap'ed readonly, write throws SEGV
	data     *[maxMapSize]byte
	datasz   int
	filesz   int // current on disk file size
	meta0    *meta
	meta1    *meta
	pageSize int
	opened   bool
	rwtx     *Tx
	txs      []*Tx
	freelist *freelist
	stats    Stats

	pagePool sync.Pool

	batchMu sync.Mutex
	batch   *batch

	rwlock   sync.Mutex   // Allows only one writer at a time.
	metalock sync.Mutex   // Protects meta page access.
	mmaplock sync.RWMutex // Protects mmap access during remapping.
	statlock sync.RWMutex // Protects stats access.

	ops struct {
		writeAt func(b []byte, off int64) (n int, err error)
	}

	// Read only mode.
	// When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately.
	readOnly bool
}

@allencloud
Copy link
Collaborator

LGTM

@pouchrobot pouchrobot added the LGTM one maintainer or community participant agrees to merge the pull reuqest. label May 10, 2018
@allencloud allencloud merged commit 49302b7 into AliyunContainerService:master May 10, 2018
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

kind/bug This is bug report for project LGTM one maintainer or community participant agrees to merge the pull reuqest. size/S

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants