-
Notifications
You must be signed in to change notification settings - Fork 80
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Add ability to bundle all records from one micro-batch into PutRecords #86
base: master
Are you sure you want to change the base?
Changes from 5 commits
6a859e8
e346bc4
fb4cf40
3a8a515
f75312d
321333f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,11 +43,73 @@ private[kinesis] class KinesisWriteTask(producerConfiguration: Map[String, Strin | |
s"${KinesisSourceProvider.SINK_FLUSH_WAIT_TIME_MILLIS} has to be a positive integer") | ||
} | ||
|
||
private val sinKBundleRecords = Try(producerConfiguration.getOrElse( | ||
KinesisSourceProvider.SINK_BUNDLE_RECORDS, | ||
KinesisSourceProvider.DEFAULT_SINK_BUNDLE_RECORDS).toBoolean).getOrElse { | ||
throw new IllegalArgumentException( | ||
s"${KinesisSourceProvider.SINK_BUNDLE_RECORDS} has to be a boolean value") | ||
} | ||
|
||
private var failedWrite: Throwable = _ | ||
|
||
|
||
def execute(iterator: Iterator[InternalRow]): Unit = { | ||
|
||
if (sinKBundleRecords) { | ||
bundleExecute(iterator) | ||
} else { | ||
singleExecute(iterator) | ||
} | ||
|
||
} | ||
|
||
private def bundleExecute(iterator: Iterator[InternalRow]): Unit = { | ||
|
||
val groupedIterator: iterator.GroupedIterator[InternalRow] = iterator.grouped(490) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what is 490 here? Should it be configurable? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 |
||
|
||
while (groupedIterator.hasNext) { | ||
val rowList = groupedIterator.next() | ||
sendBundledData(rowList) | ||
} | ||
|
||
} | ||
|
||
private def sendBundledData(rowList: List[InternalRow]): Unit = { | ||
producer = CachedKinesisProducer.getOrCreate(producerConfiguration) | ||
|
||
val kinesisCallBack = new FutureCallback[UserRecordResult]() { | ||
|
||
override def onFailure(t: Throwable): Unit = { | ||
if (failedWrite == null && t!= null) { | ||
failedWrite = t | ||
logError(s"Writing to $streamName failed due to ${t.getCause}") | ||
} | ||
} | ||
|
||
override def onSuccess(result: UserRecordResult): Unit = { | ||
logDebug(s"Successfully put records: \n " + | ||
s"sequenceNumber=${result.getSequenceNumber}, \n" + | ||
s"shardId=${result.getShardId}, \n" + | ||
s"attempts=${result.getAttempts.size}") | ||
} | ||
} | ||
|
||
for (r <- rowList) { | ||
|
||
val projectedRow = projection(r) | ||
val partitionKey = projectedRow.getString(0) | ||
val data = projectedRow.getBinary(1) | ||
|
||
val future = producer.addUserRecord(streamName, partitionKey, ByteBuffer.wrap(data)) | ||
|
||
Futures.addCallback(future, kinesisCallBack) | ||
|
||
} | ||
} | ||
|
||
private def singleExecute(iterator: Iterator[InternalRow]): Unit = { | ||
producer = CachedKinesisProducer.getOrCreate(producerConfiguration) | ||
|
||
while (iterator.hasNext && failedWrite == null) { | ||
val currentRow = iterator.next() | ||
val projectedRow = projection(currentRow) | ||
|
@@ -56,11 +118,10 @@ private[kinesis] class KinesisWriteTask(producerConfiguration: Map[String, Strin | |
|
||
sendData(partitionKey, data) | ||
} | ||
} | ||
|
||
def sendData(partitionKey: String, data: Array[Byte]): String = { | ||
var sentSeqNumbers = new String | ||
} | ||
|
||
private def sendData(partitionKey: String, data: Array[Byte]): Unit = { | ||
val future = producer.addUserRecord(streamName, partitionKey, ByteBuffer.wrap(data)) | ||
|
||
val kinesisCallBack = new FutureCallback[UserRecordResult]() { | ||
|
@@ -73,14 +134,17 @@ private[kinesis] class KinesisWriteTask(producerConfiguration: Map[String, Strin | |
} | ||
|
||
override def onSuccess(result: UserRecordResult): Unit = { | ||
val shardId = result.getShardId | ||
sentSeqNumbers = result.getSequenceNumber | ||
logDebug(s"Successfully put records: \n " + | ||
s"sequenceNumber=${result.getSequenceNumber}, \n" + | ||
s"shardId=${result.getShardId}, \n" + | ||
s"attempts=${result.getAttempts.size}") | ||
} | ||
|
||
} | ||
|
||
Futures.addCallback(future, kinesisCallBack) | ||
|
||
producer.flushSync() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @leslieyanyan @itsvikramagr The new code in this PR is showing improved performance because method We'll need to separately evaluate how much performance impact we're getting by using |
||
sentSeqNumbers | ||
} | ||
|
||
private def flushRecordsIfNecessary(): Unit = { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
We have added "kinesis.executor.recordTtl" — can we add details about this config here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1