client: Cache tikv request in tidb client side #1098
Changes from 15 commits
@@ -70,15 +70,15 @@ type batchCommandsEntry struct {
 	// canceled indicated the request is canceled or not.
 	canceled int32
 	err error
+	pri uint64
 }

 func (b *batchCommandsEntry) isCanceled() bool {
 	return atomic.LoadInt32(&b.canceled) == 1
 }

-// TODO: implement by the request priority.
-func (b *batchCommandsEntry) priority() int {
-	return 0
+func (b *batchCommandsEntry) priority() uint64 {
+	return b.pri
 }

 func (b *batchCommandsEntry) error(err error) {
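The pri field and the uint64 priority() are only half of the picture; the container behind b.entries (Push/Pop in the later hunks) is not shown in this hunk. As a rough illustration of how entries carrying such a priority could be ordered, here is a minimal container/heap-based queue over a simplified stand-in type, assuming larger values mean higher priority. This is an assumed sketch, not the PR's actual queue implementation.

package main

import (
	"container/heap"
	"fmt"
)

// entryForDemo is a simplified stand-in for batchCommandsEntry:
// only the priority value matters for ordering.
type entryForDemo struct {
	pri uint64
	id  int
}

// entryHeap orders entries so that the highest pri value is popped first
// (an assumption; the PR does not show the ordering rule in this hunk).
type entryHeap []*entryForDemo

func (h entryHeap) Len() int            { return len(h) }
func (h entryHeap) Less(i, j int) bool  { return h[i].pri > h[j].pri }
func (h entryHeap) Swap(i, j int)       { h[i], h[j] = h[j], h[i] }
func (h *entryHeap) Push(x interface{}) { *h = append(*h, x.(*entryForDemo)) }
func (h *entryHeap) Pop() interface{} {
	old := *h
	n := len(old)
	e := old[n-1]
	*h = old[:n-1]
	return e
}

func main() {
	h := &entryHeap{}
	heap.Init(h)
	heap.Push(h, &entryForDemo{pri: 1, id: 1})
	heap.Push(h, &entryForDemo{pri: 8, id: 2})
	heap.Push(h, &entryForDemo{pri: 8, id: 3})
	// Entries come out grouped by priority, higher priority first.
	for h.Len() > 0 {
		e := heap.Pop(h).(*entryForDemo)
		fmt.Println(e.id, e.pri)
	}
}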
@@ -107,14 +107,14 @@ func (b *batchCommandsBuilder) push(entry *batchCommandsEntry) {
 	b.entries.Push(entry)
 }

-// build builds BatchCommandsRequests and calls collect() for each valid entry.
+// buildWithLimit builds BatchCommandsRequests and calls collect() for each valid entry.
 // The first return value is the request that doesn't need forwarding.
 // The second is a map that maps forwarded hosts to requests.
-func (b *batchCommandsBuilder) build(
-	collect func(id uint64, e *batchCommandsEntry),
+func (b *batchCommandsBuilder) buildWithLimit(limit int64, collect func(id uint64, e *batchCommandsEntry),
 ) (*tikvpb.BatchCommandsRequest, map[string]*tikvpb.BatchCommandsRequest) {
-	for _, entry := range b.entries.All() {
-		e := entry.(*batchCommandsEntry)
+	pri, pending := uint64(0), b.entries.Len()
+	for count, i := int64(0), 0; i < pending; i++ {
+		e := b.entries.Pop().(*batchCommandsEntry)
 		if e.isCanceled() {
 			continue
 		}
@@ -133,7 +133,15 @@ func (b *batchCommandsBuilder) build(
 			batchReq.RequestIds = append(batchReq.RequestIds, b.idAlloc)
 			batchReq.Requests = append(batchReq.Requests, e.req)
 		}
+		if count == 0 {
+			pri = e.priority()
+		}
+		count++
 		b.idAlloc++
+		// keep one batch for each priority, don't max different priority request into one batch requests.
+		if count >= limit || e.priority() != pri {
+			break
+		}
 	}
 	var req *tikvpb.BatchCommandsRequest
 	if len(b.requests) > 0 {
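The loop above is the new batching policy: pop pending entries, drop canceled ones, remember the priority of the first collected entry, and stop once the per-send limit is reached or an entry with a different priority shows up, so that one BatchCommands request is built per priority level. Below is a condensed sketch of that policy with simplified stand-in types (entry, collectBatch, and the slice-based queue are illustrative, not the PR's code).

package demo

// entry is a simplified stand-in for batchCommandsEntry.
type entry struct {
	pri      uint64
	canceled bool
}

// collectBatch mirrors the spirit of buildWithLimit's loop: it gathers at most
// limit non-canceled entries and ends the batch when the priority changes.
func collectBatch(pending []*entry, limit int64) (batch []*entry, rest []*entry) {
	var pri uint64
	var count int64
	for i, e := range pending {
		if e.canceled {
			continue // canceled entries are skipped, like isCanceled() entries in the diff
		}
		if count == 0 {
			pri = e.pri // the first collected entry fixes the batch priority
		}
		batch = append(batch, e)
		count++
		// Stop at the size limit or at a priority boundary; as in the diff's
		// break condition, the entry that crosses the boundary still closes this batch.
		if count >= limit || e.pri != pri {
			return batch, pending[i+1:]
		}
	}
	return batch, nil
}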
@@ -145,20 +153,22 @@ func (b *batchCommandsBuilder) build(
 	return req, b.forwardingReqs
 }

+// cancel all requests, only used in test.
 func (b *batchCommandsBuilder) cancel(e error) {
 	for _, entry := range b.entries.All() {
 		entry.(*batchCommandsEntry).error(e)
 	}
+	b.entries.Reset()
 }

 // reset resets the builder to the initial state.
 // Should call it before collecting a new batch.
 func (b *batchCommandsBuilder) reset() {
-	b.entries.clean()
 	// NOTE: We can't simply set entries = entries[:0] here.
 	// The data in the cap part of the slice would reference the prewrite keys whose
 	// underlying memory is borrowed from memdb. The reference cause GC can't release
 	// the memdb, leading to serious memory leak problems in the large transaction case.
+	b.entries.Reset()
 	for i := 0; i < len(b.requests); i++ {
 		b.requests[i] = nil
 	}
@@ -336,8 +346,7 @@ func (a *batchConn) batchSendLoop(cfg config.TiKVClient) {
 				a.fetchMorePendingRequests(int(cfg.MaxBatchSize), int(bestBatchWaitSize), cfg.MaxBatchWaitTime)
 			}
 		}
-		a.pendingRequests.Observe(float64(len(a.batchCommandsCh)))
-		a.batchSize.Observe(float64(a.reqBuilder.len()))
+		a.pendingRequests.Observe(float64(len(a.batchCommandsCh) + a.reqBuilder.len()))
 		length := a.reqBuilder.len()
 		if uint(length) == 0 {
 			// The batch command channel is closed.
@@ -349,12 +358,20 @@ func (a *batchConn) batchSendLoop(cfg config.TiKVClient) {
 			bestBatchWaitSize++
 		}

-		a.getClientAndSend()
+		batch := a.getClientAndSend()
+		if batch != 0 {
+			a.batchSize.Observe(float64(a.reqBuilder.len()))
+		}
 		metrics.TiKVBatchSendLatency.Observe(float64(time.Since(start)))
 	}
 }

-func (a *batchConn) getClientAndSend() {
+const (
+	SendFailedReasonNoAvailableLimit   = "no available limit"
+	SendFailedReasonTryLockForSendFail = "tryLockForSend fail"
+)
+
+func (a *batchConn) getClientAndSend() int {
 	if val, err := util.EvalFailpoint("mockBatchClientSendDelay"); err == nil {
 		if timeout, ok := val.(int); ok && timeout > 0 {
 			time.Sleep(time.Duration(timeout * int(time.Millisecond)))
@@ -366,37 +383,47 @@ func (a *batchConn) getClientAndSend() {
 		cli    *batchCommandsClient
 		target string
 	)
+	reason := ""
 	for i := 0; i < len(a.batchCommandsClients); i++ {
 		a.index = (a.index + 1) % uint32(len(a.batchCommandsClients))
 		target = a.batchCommandsClients[a.index].target
 		// The lock protects the batchCommandsClient from been closed while it's in use.
-		if a.batchCommandsClients[a.index].tryLockForSend() {
-			cli = a.batchCommandsClients[a.index]
-			break
+		if c := a.batchCommandsClients[a.index]; c.tryLockForSend() {
+			if c.sent.Load() <= c.maxConcurrencyRequestLimit.Load() {
+				cli = c
+				break
+			} else {
+				reason = SendFailedReasonNoAvailableLimit
+				c.unlockForSend()
+			}
+		} else {
+			reason = SendFailedReasonTryLockForSendFail
 		}
 	}
 	if cli == nil {
-		logutil.BgLogger().Warn("no available connections", zap.String("target", target))
+		logutil.BgLogger().Warn("no available connections", zap.String("target", target), zap.String("reason", reason))
 		metrics.TiKVNoAvailableConnectionCounter.Inc()

-		// Please ensure the error is handled in region cache correctly.
-		a.reqBuilder.cancel(errors.New("no available connections"))
Comment: After this change these requests are not canceled; they are only retried when a new request arrives. So they will be blocked if no new requests come in. Is that the intended behavior?

Reply: Yes, the requests may block if no new requests arrive and the configured limit is small. They will time out and then be retried. I will fix it with a notification mechanism.

Comment: For the first issue, we may handle it in …

Reply: It may cause a busy loop if no client has send tokens available. I will optimize it by using a channel to notify the sender to send requests again. (A sketch of such a channel-based wake-up follows this hunk.)
-		return
+		return 0
 	}
 	defer cli.unlockForSend()

-	req, forwardingReqs := a.reqBuilder.build(func(id uint64, e *batchCommandsEntry) {
+	available := cli.maxConcurrencyRequestLimit.Load() - cli.sent.Load()
+	batch := 0
+	req, forwardingReqs := a.reqBuilder.buildWithLimit(available, func(id uint64, e *batchCommandsEntry) {
 		cli.batched.Store(id, e)
+		cli.sent.Add(1)
 		if trace.IsEnabled() {
 			trace.Log(e.ctx, "rpc", "send")
 		}
 	})
 	if req != nil {
+		batch += len(req.RequestIds)
 		cli.send("", req)
 	}
 	for forwardedHost, req := range forwardingReqs {
+		batch += len(req.RequestIds)
 		cli.send(forwardedHost, req)
 	}
+	return batch
 }

 type tryLock struct {
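Taken together with the new struct fields further down, these changes give every connection an in-flight budget: a client is only picked when sent is within maxConcurrencyRequestLimit, the builder is asked for at most limit minus sent entries, and sent grows by one per collected entry and shrinks when a response arrives or a pending request is failed. The review thread above also mentions replacing silent waiting with a channel-based wake-up. Below is a compressed, hypothetical sketch of both ideas (creditedClient and its methods are illustrative, not the PR's code).

package demo

import "sync/atomic"

// creditedClient is a stand-in for batchCommandsClient: it tracks how many
// requests are in flight and how many it is allowed to have in flight.
type creditedClient struct {
	sent  atomic.Int64 // requests sent but not yet answered
	limit atomic.Int64 // max in-flight requests allowed
	// wake is the channel-based notification mentioned in the review thread,
	// so a parked sender can retry once credits return (assumed, not in the diff).
	wake chan struct{}
}

func newCreditedClient(limit int64) *creditedClient {
	c := &creditedClient{wake: make(chan struct{}, 1)}
	c.limit.Store(limit)
	return c
}

// available returns how many more requests may be sent right now.
func (c *creditedClient) available() int64 {
	avail := c.limit.Load() - c.sent.Load()
	if avail < 0 {
		return 0
	}
	return avail
}

// onSend is called once per batch with the number of requests put on the wire.
func (c *creditedClient) onSend(n int64) { c.sent.Add(n) }

// onDone is called when a response arrives or a pending request is failed,
// mirroring the sent.Add(-1) calls added in batchRecvLoop and failPendingRequests.
func (c *creditedClient) onDone() {
	c.sent.Add(-1)
	// Non-blocking notify so the sender can retry without busy looping.
	select {
	case c.wake <- struct{}{}:
	default:
	}
}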
@@ -507,6 +534,10 @@ type batchCommandsClient struct {
 	closed int32
 	// tryLock protects client when re-create the streaming.
 	tryLock
+	// sent is the counter of sent requests to tikv but not accept response.
+	sent atomic.Int64
+	// limit is the max number of requests can be sent to tikv but not accept response.
+	maxConcurrencyRequestLimit atomic.Int64
 }

 func (c *batchCommandsClient) isStopped() bool {
@@ -549,6 +580,7 @@ func (c *batchCommandsClient) failPendingRequests(err error) {
 		id, _ := key.(uint64)
 		entry, _ := value.(*batchCommandsEntry)
 		c.batched.Delete(id)
+		c.sent.Add(-1)
 		entry.error(err)
 		return true
 	})
@@ -661,6 +693,7 @@ func (c *batchCommandsClient) batchRecvLoop(cfg config.TiKVClient, tikvTransport
 			entry.res <- responses[i]
 		}
 		c.batched.Delete(requestID)
+		c.sent.Add(-1)
 	}

 	transportLayerLoad := resp.GetTransportLayerLoad()
@@ -779,6 +812,7 @@ func sendBatchRequest(
 	batchConn *batchConn,
 	req *tikvpb.BatchCommandsRequest_Request,
 	timeout time.Duration,
+	priority uint64,
 ) (*tikvrpc.Response, error) {
 	entry := &batchCommandsEntry{
 		ctx: ctx,
@@ -787,6 +821,7 @@ func sendBatchRequest(
 		forwardedHost: forwardedHost,
 		canceled: 0,
 		err: nil,
+		pri: priority,
 	}
 	timer := time.NewTimer(timeout)
 	defer timer.Stop()
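With the extra parameter, the caller chooses the priority at enqueue time and the value simply rides along inside the entry until buildWithLimit groups by it. A toy illustration of that plumbing with simplified stand-in types (demoEntry and enqueue are hypothetical; only the pri hand-off mirrors the diff):

package demo

// demoEntry mirrors only the batchCommandsEntry fields that matter here.
type demoEntry struct {
	req interface{} // the wrapped request payload
	pri uint64      // carried through unchanged until batching
}

// enqueue is a hypothetical stand-in for sendBatchRequest: the priority picked
// by the caller is stored on the entry, and nothing else reads it until
// buildWithLimit groups entries by priority.
func enqueue(q chan<- *demoEntry, req interface{}, priority uint64) {
	q <- &demoEntry{req: req, pri: priority}
}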
Comment: This would be a public TiDB configuration. Should it be approved by a PM member, according to the current process requirements?
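The comment refers to the knob behind maxConcurrencyRequestLimit. The diff does not show the option's name or default, so the following is a purely hypothetical sketch of how such a setting could sit in a TiKVClient-style config struct, only to make the discussion concrete; the field name is an assumption.

package demo

import "sync/atomic"

// demoTiKVClientConfig is a hypothetical stand-in for config.TiKVClient; the
// MaxConcurrencyRequestLimit field name and its meaning as a per-connection
// in-flight cap are assumptions, not taken from the diff.
type demoTiKVClientConfig struct {
	MaxBatchSize               uint
	MaxConcurrencyRequestLimit int64
}

// applyLimit copies the configured value into the per-client atomic limit
// that getClientAndSend compares against the sent counter.
func applyLimit(cfg demoTiKVClientConfig, limit *atomic.Int64) {
	limit.Store(cfg.MaxConcurrencyRequestLimit)
}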