localstack / localstack / 79dfcfad-c74f-4652-9b54-023fe45fb4d0

24 Mar 2025 08:24PM UTC coverage: 86.865% (-0.02%) from 86.882%
Build: push · CI: circleci · Committer: web-flow

Cloud Formation: [POC] Scoping Mechanism, Base Support for Parameters, Dynamic Parameters, Conditions, Intrinsic Functions, and Type Divergence (#12405)

Co-authored-by: Simon Walker <simon.walker@localstack.cloud>

459 of 498 new or added lines in 3 files covered (92.17%).
63 existing lines in 14 files now uncovered.
63,144 of 72,692 relevant lines covered (86.87%).
0.87 hits per line.

Source File (84.85% covered):
/localstack-core/localstack/services/lambda_/event_source_mapping/pollers/stream_poller.py

import json
import logging
import threading
from abc import abstractmethod
from datetime import datetime
from typing import Iterator

from botocore.client import BaseClient
from botocore.exceptions import ClientError

from localstack.aws.api.pipes import (
    OnPartialBatchItemFailureStreams,
)
from localstack.services.lambda_.event_source_mapping.event_processor import (
    BatchFailureError,
    CustomerInvocationError,
    EventProcessor,
    PartialBatchFailureError,
    PipeInternalError,
)
from localstack.services.lambda_.event_source_mapping.pipe_utils import (
    get_current_time,
    get_datetime_from_timestamp,
    get_internal_client,
)
from localstack.services.lambda_.event_source_mapping.pollers.poller import (
    EmptyPollResultsException,
    Poller,
    get_batch_item_failures,
)
from localstack.services.lambda_.event_source_mapping.pollers.sqs_poller import get_queue_url
from localstack.utils.aws.arns import parse_arn, s3_bucket_name
from localstack.utils.backoff import ExponentialBackoff
from localstack.utils.strings import long_uid

LOG = logging.getLogger(__name__)


# TODO: fix this poller to support resharding
#   https://docs.aws.amazon.com/streams/latest/dev/kinesis-using-sdk-java-resharding.html
class StreamPoller(Poller):
    # Mapping of shard id => shard iterator
    shards: dict[str, str]
    # Iterator for round-robin polling across shards because a batch cannot contain events from different shards.
    # This is a workaround for not handling shards in parallel.
    iterator_over_shards: Iterator[tuple[str, str]] | None
    # The ESM UUID is needed in failure processing to form the S3 failure destination object key
    esm_uuid: str | None

    # The ARN of the processor (e.g., Pipe ARN)
    partner_resource_arn: str | None

    # Used for backing off between retries and breaking the retry loop
    _is_shutdown: threading.Event

    def __init__(
        self,
        source_arn: str,
        source_parameters: dict | None = None,
        source_client: BaseClient | None = None,
        processor: EventProcessor | None = None,
        partner_resource_arn: str | None = None,
        esm_uuid: str | None = None,
    ):
        super().__init__(source_arn, source_parameters, source_client, processor)
        self.partner_resource_arn = partner_resource_arn
        self.esm_uuid = esm_uuid
        self.shards = {}
        self.iterator_over_shards = None

        self._is_shutdown = threading.Event()

    @abstractmethod
    def transform_into_events(self, records: list[dict], shard_id) -> list[dict]:
        pass

    @property
    @abstractmethod
    def stream_parameters(self) -> dict:
        pass

    @abstractmethod
    def initialize_shards(self) -> dict[str, str]:
        """Returns a shard dict mapping from shard id -> shard iterator.
        The implementations for Kinesis and DynamoDB are similar but differ in various ways:
        * Kinesis uses "StreamARN" and DynamoDB uses "StreamArn" as the source parameter
        * Kinesis uses "StreamStatus.ACTIVE" and DynamoDB uses "StreamStatus.ENABLED"
        * Only Kinesis supports the additional StartingPosition "AT_TIMESTAMP" using "StartingPositionTimestamp"
        """
        pass

    @abstractmethod
    def stream_arn_param(self) -> dict:
        """Returns a dict with the correct key/value pair for the stream ARN used in GetRecords:
        either StreamARN for Kinesis or {} for DynamoDB (unsupported)."""
        pass

    @abstractmethod
    def failure_payload_details_field_name(self) -> str:
        pass

    @abstractmethod
    def get_approximate_arrival_time(self, record: dict) -> float:
        pass

    @abstractmethod
    def format_datetime(self, time: datetime) -> str:
        """Formats a datetime in the correct format for DynamoDB (with ms) or Kinesis (without ms)."""
        pass

    @abstractmethod
    def get_sequence_number(self, record: dict) -> str:
        pass
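
    # Illustrative sketch: a concrete subclass binds the abstract methods above to one streaming
    # service. A hypothetical Kinesis-backed poller might implement, for example:
    #
    #     def stream_arn_param(self) -> dict:
    #         return {"StreamARN": self.source_arn}  # DynamoDB Streams would return {}
    #
    #     def get_sequence_number(self, record: dict) -> str:
    #         return record["kinesis"]["sequenceNumber"]  # assumes the Kinesis event shape
    #
    # The field names here are assumptions based on the documented Kinesis Lambda event structure.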

    def close(self):
        self._is_shutdown.set()

    def pre_filter(self, events: list[dict]) -> list[dict]:
        return events

    def post_filter(self, events: list[dict]) -> list[dict]:
        return events

    def poll_events(self):
        """Generalized poller for streams such as Kinesis or DynamoDB.
        Examples of Kinesis consumers:
        * StackOverflow: https://stackoverflow.com/a/22403036/6875981
        * AWS Sample: https://github.com/aws-samples/kinesis-poster-worker/blob/master/worker.py
        Examples of DynamoDB consumers:
        * Blogpost: https://www.tecracer.com/blog/2022/05/getting-a-near-real-time-view-of-a-dynamodb-stream-with-python.html
        """
        # TODO: consider potential shard iterator timeout after 300 seconds (likely not relevant with short-polling):
        #   https://docs.aws.amazon.com/streams/latest/dev/troubleshooting-consumers.html#shard-iterator-expires-unexpectedly
        #  Does this happen if no records are received for 300 seconds?
        if not self.shards:
            self.shards = self.initialize_shards()

        if not self.shards:
            LOG.debug("No shards found for %s.", self.source_arn)
            raise EmptyPollResultsException(service=self.event_source(), source_arn=self.source_arn)
        else:
            LOG.debug("Event source %s has %d shards.", self.source_arn, len(self.shards))

        # TODO: improve efficiency because this currently limits the throughput to at most batch size per poll interval
        # Handle shards round-robin. Re-initialize the current shard iterator once all shards have been handled.
        if self.iterator_over_shards is None:
            self.iterator_over_shards = iter(self.shards.items())

        current_shard_tuple = next(self.iterator_over_shards, None)
        if not current_shard_tuple:
            self.iterator_over_shards = iter(self.shards.items())
            current_shard_tuple = next(self.iterator_over_shards, None)

        # TODO: better handling for when shards are initialized but the iterator returns nothing
        if not current_shard_tuple:
            raise PipeInternalError(
                "Failed to retrieve any shards for stream polling despite initialization."
            )

        try:
            self.poll_events_from_shard(*current_shard_tuple)
        except PipeInternalError:
            # TODO: standardize logging
            # Ignore and wait for the next polling interval, which will retry.
            pass
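
    # Round-robin sketch (illustrative shard ids): with shards {"shardId-000": it0, "shardId-001": it1},
    # successive poll_events() calls serve one shard each ("shardId-000", then "shardId-001") and then
    # re-create the iterator from self.shards, so each poll interval yields at most one batch from one
    # shard; this is the throughput limitation flagged in the TODO above.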

    def poll_events_from_shard(self, shard_id: str, shard_iterator: str):
        abort_condition = None
        get_records_response = self.get_records(shard_iterator)
        records = get_records_response.get("Records", [])
        if not records:
            # We cannot reliably back off when no records are found since an iterator
            # may have to move multiple times until records are returned.
            # See https://docs.aws.amazon.com/streams/latest/dev/troubleshooting-consumers.html#getrecords-returns-empty
            self.shards[shard_id] = get_records_response["NextShardIterator"]
            return

        polled_events = self.transform_into_events(records, shard_id)

        # Check MaximumRecordAgeInSeconds
        if maximum_record_age_in_seconds := self.stream_parameters.get("MaximumRecordAgeInSeconds"):
            arrival_timestamp_of_last_event = polled_events[-1]["approximateArrivalTimestamp"]
            now = get_current_time().timestamp()
            record_age_in_seconds = now - arrival_timestamp_of_last_event
            if record_age_in_seconds > maximum_record_age_in_seconds:
                abort_condition = "RecordAgeExpired"

        # TODO: implement format detection behavior (e.g., for JSON body):
        #  https://docs.aws.amazon.com/eventbridge/latest/userguide/eb-pipes-event-filtering.html
        #  Check whether we need poller-specific filter-preprocessing here without modifying the actual event!
        # Convert to JSON for filtering (HACK for fixing parity with v1 and getting regression tests passing):
        # localstack.services.lambda_.event_source_listeners.kinesis_event_source_listener.KinesisEventSourceListener._filter_records
        # TODO: explore a better abstraction for the entire filtering, including the set_data and get_data remapping.
        #  We need to better clarify which transformations happen before and after filtering -> fix missing test coverage
        parsed_events = self.pre_filter(polled_events)
        # TODO: advance the iterator past matching events!
        #  We need to checkpoint the sequence number for each shard and then advance the shard iterator using
        #  GetShardIterator with a given sequence number
        #  https://docs.aws.amazon.com/kinesis/latest/APIReference/API_GetShardIterator.html
        #  Failing to do so effectively blocks the stream, resulting in very high latency.
        matching_events = self.filter_events(parsed_events)
        matching_events_post_filter = self.post_filter(matching_events)

        # TODO: implement the MaximumBatchingWindowInSeconds flush condition (before or after filter?)
        # Don't trigger upon empty events
        if len(matching_events_post_filter) == 0:
            # Update the shard iterator if no records match the filter
            self.shards[shard_id] = get_records_response["NextShardIterator"]
            return
        events = self.add_source_metadata(matching_events_post_filter)
        LOG.debug("Polled %d events from %s in shard %s", len(events), self.source_arn, shard_id)
        # TODO: A retry should probably re-trigger fetching the record from the stream again?!
        #  -> This could be tested by setting a high retry number, using a long pipe execution, and a relatively
        #  short record expiration age at the source. Check what happens if the record expires at the source.
        #  A potential implementation could use checkpointing based on the iterator position (within shard scope).
        # TODO: handle partial batch failure (see poller.py:parse_batch_item_failures)
        # TODO: think about how to avoid starvation of other shards if one shard runs into infinite retries
        attempts = 0
        error_payload = {}

        max_retries = self.stream_parameters.get("MaximumRetryAttempts", -1)
        # NOTE: max_retries == 0 means exponential backoff is disabled
        boff = ExponentialBackoff(max_retries=max_retries)
        while (
            not abort_condition
            and not self.max_retries_exceeded(attempts)
            and not self._is_shutdown.is_set()
        ):
            try:
                if attempts > 0:
                    # TODO: Should we always back off (with jitter) before processing, since we may not want
                    #  multiple pollers all starting up and polling simultaneously?
                    #  For example: 500 persisted ESMs starting up and requesting concurrently could flood the gateway.
                    self._is_shutdown.wait(boff.next_backoff())

                self.processor.process_events_batch(events)
                boff.reset()

                # Update the shard iterator if execution is successful
                self.shards[shard_id] = get_records_response["NextShardIterator"]
                return
            except PartialBatchFailureError as ex:
                # TODO: add tests for partial batch failure scenarios
                if (
                    self.stream_parameters.get("OnPartialBatchItemFailure")
                    == OnPartialBatchItemFailureStreams.AUTOMATIC_BISECT
                ):
                    # TODO: implement and test splitting batches in half until batch size 1
                    #  https://docs.aws.amazon.com/eventbridge/latest/pipes-reference/API_PipeSourceKinesisStreamParameters.html
                    LOG.warning(
                        "AUTOMATIC_BISECT upon partial batch item failure is not yet implemented. Retrying the entire batch."
                    )
                error_payload = ex.error

                # Extract all sequence numbers from the events in the batch. This allows us to fail the
                # whole batch if an unknown item identifier is returned.
                batch_sequence_numbers = {
                    self.get_sequence_number(event) for event in matching_events
                }

                # If the batchItemFailures array contains multiple items, Lambda uses the record with the lowest
                # sequence number as the checkpoint and then retries all records starting from that checkpoint.
                failed_sequence_ids: list[int] | None = get_batch_item_failures(
                    ex.partial_failure_payload, batch_sequence_numbers
                )

                # If None is returned, consider the entire batch a failure.
                if failed_sequence_ids is None:
                    continue

                # This shouldn't be possible since a PartialBatchFailureError was raised
                if len(failed_sequence_ids) == 0:
                    assert failed_sequence_ids, (
                        "Invalid state encountered: PartialBatchFailureError raised but no batch item failures found."
                    )

                lowest_sequence_id: str = min(failed_sequence_ids, key=int)

                # Discard all successful events and re-process from the sequence number of the failed event
                _, events = self.bisect_events(lowest_sequence_id, events)
            except (BatchFailureError, Exception) as ex:
                if isinstance(ex, BatchFailureError):
                    error_payload = ex.error

                # FIXME: partner_resource_arn is not defined in ESM
                LOG.debug(
                    "Attempt %d failed while processing %s with events: %s",
                    attempts,
                    self.partner_resource_arn or self.source_arn,
                    events,
                )
            finally:
                # Retry polling until the record expires at the source
                attempts += 1

        # Send failed events to a potential DLQ
        abort_condition = abort_condition or "RetryAttemptsExhausted"
        failure_context = self.processor.generate_event_failure_context(
            abort_condition=abort_condition,
            error=error_payload,
            attempts_count=attempts,
            partner_resource_arn=self.partner_resource_arn,
        )
        self.send_events_to_dlq(shard_id, events, context=failure_context)
        # Update the shard iterator if the execution failed but the events were sent to a DLQ
        self.shards[shard_id] = get_records_response["NextShardIterator"]
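
    # Worked example of the partial-failure path above (illustrative sequence numbers): for a batch
    # "101".."105" with reported batchItemFailures ["103", "105"], the lowest failing sequence
    # number "103" becomes the checkpoint; bisect_events() drops "101"-"102" and the loop retries
    # "103".."105" as the new batch.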

    def get_records(self, shard_iterator: str) -> dict:
        """Returns a GetRecordsOutput from the GetRecords endpoint of streaming services such as Kinesis or DynamoDB"""
        try:
            get_records_response = self.source_client.get_records(
                # TODO: add test for cross-account scenario
                # Differs for Kinesis and DynamoDB but required for cross-account scenario
                **self.stream_arn_param(),
                ShardIterator=shard_iterator,
                Limit=self.stream_parameters["BatchSize"],
            )
            return get_records_response
        # TODO: test iterator expired with conditional error scenario (requires failure destinations)
        except self.source_client.exceptions.ExpiredIteratorException as e:
            LOG.debug(
                "Shard iterator %s expired for stream %s, re-initializing shards",
                shard_iterator,
                self.source_arn,
            )
            # TODO: test TRIM_HORIZON and AT_TIMESTAMP scenarios for this case. We don't want to start from scratch and
            #  might need to think about checkpointing here.
            self.shards = self.initialize_shards()
            raise PipeInternalError from e
        except ClientError as e:
            if "AccessDeniedException" in str(e):
                LOG.warning(
                    "Insufficient permissions to get records from stream %s: %s",
                    self.source_arn,
                    e,
                )
                raise CustomerInvocationError from e
            elif "ResourceNotFoundException" in str(e):
                # FIXME: The 'Invalid ShardId in ShardIterator' error is returned by DynamoDB-local. Unsure when/why this is returned.
                if "Invalid ShardId in ShardIterator" in str(e):
                    LOG.warning(
                        "Invalid ShardId in ShardIterator for %s. Re-initializing shards.",
                        self.source_arn,
                    )
                    self.shards = self.initialize_shards()
                else:
                    LOG.warning(
                        "Source stream %s does not exist: %s",
                        self.source_arn,
                        e,
                    )
                    raise CustomerInvocationError from e
            elif "TrimmedDataAccessException" in str(e):
                LOG.debug(
                    "Attempted to iterate over trimmed record or expired shard iterator %s for stream %s, re-initializing shards",
                    shard_iterator,
                    self.source_arn,
                )
                self.shards = self.initialize_shards()
            else:
                LOG.debug("ClientError during get_records for stream %s: %s", self.source_arn, e)
            raise PipeInternalError from e
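
    # Error dispatch above, in brief: ExpiredIteratorException and TrimmedDataAccessException
    # re-initialize the shards and surface as PipeInternalError (retried on the next poll interval),
    # while AccessDeniedException and a non-existent source stream surface as CustomerInvocationError.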

    def send_events_to_dlq(self, shard_id, events, context) -> None:
        dlq_arn = self.stream_parameters.get("DeadLetterConfig", {}).get("Arn")
        if dlq_arn:
            failure_timestamp = get_current_time()
            dlq_event = self.create_dlq_event(shard_id, events, context, failure_timestamp)
            # Send the DLQ event to the DLQ target
            parsed_arn = parse_arn(dlq_arn)
            service = parsed_arn["service"]
            # TODO: use a sender instance here, likely injected via DI into the poller (what if it updates?)
            if service == "sqs":
                # TODO: inject and cache the SQS client using a proper IAM role (supports cross-account operations)
                sqs_client = get_internal_client(dlq_arn)
                # TODO: check if the DLQ exists
                dlq_url = get_queue_url(dlq_arn)
                # TODO: validate that the queue is not a FIFO queue because they are unsupported
                sqs_client.send_message(QueueUrl=dlq_url, MessageBody=json.dumps(dlq_event))
            elif service == "sns":
                sns_client = get_internal_client(dlq_arn)
                sns_client.publish(TopicArn=dlq_arn, Message=json.dumps(dlq_event))
            elif service == "s3":
                s3_client = get_internal_client(dlq_arn)
                dlq_event_with_payload = {
                    **dlq_event,
                    "payload": {
                        "Records": events,
                    },
                }
                s3_client.put_object(
                    Bucket=s3_bucket_name(dlq_arn),
                    Key=get_failure_s3_object_key(self.esm_uuid, shard_id, failure_timestamp),
                    Body=json.dumps(dlq_event_with_payload),
                )
            else:
                LOG.warning("Unsupported DLQ service %s", service)
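
    # Illustrative DLQ configuration (hypothetical ARN): with
    #   stream_parameters["DeadLetterConfig"] == {"Arn": "arn:aws:sqs:us-east-1:111111111111:my-dlq"},
    # parse_arn() yields the service "sqs", so the failure record is delivered via send_message
    # to the resolved queue URL.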

    def create_dlq_event(
        self, shard_id: str, events: list[dict], context: dict, failure_timestamp: datetime
    ) -> dict:
        first_record = events[0]
        first_record_arrival = get_datetime_from_timestamp(
            self.get_approximate_arrival_time(first_record)
        )

        last_record = events[-1]
        last_record_arrival = get_datetime_from_timestamp(
            self.get_approximate_arrival_time(last_record)
        )
        return {
            **context,
            self.failure_payload_details_field_name(): {
                "approximateArrivalOfFirstRecord": self.format_datetime(first_record_arrival),
                "approximateArrivalOfLastRecord": self.format_datetime(last_record_arrival),
                "batchSize": len(events),
                "endSequenceNumber": self.get_sequence_number(last_record),
                "shardId": shard_id,
                "startSequenceNumber": self.get_sequence_number(first_record),
                "streamArn": self.source_arn,
            },
            "timestamp": failure_timestamp.isoformat(timespec="milliseconds").replace(
                "+00:00", "Z"
            ),
            "version": "1.0",
        }
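
    # Sketch of a resulting DLQ event, assuming a Kinesis source whose
    # failure_payload_details_field_name() is "KinesisBatchInfo" (illustrative values):
    #   {
    #       ...failure context...,
    #       "KinesisBatchInfo": {
    #           "approximateArrivalOfFirstRecord": "2025-03-24T20:24:15Z",
    #           "approximateArrivalOfLastRecord": "2025-03-24T20:24:16Z",
    #           "batchSize": 2,
    #           "shardId": "shardId-000000000000",
    #           "startSequenceNumber": "101",
    #           "endSequenceNumber": "102",
    #           "streamArn": "arn:aws:kinesis:us-east-1:111111111111:stream/my-stream",
    #       },
    #       "timestamp": "2025-03-24T20:24:20.123Z",
    #       "version": "1.0",
    #   }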

    def max_retries_exceeded(self, attempts: int) -> bool:
        maximum_retry_attempts = self.stream_parameters.get("MaximumRetryAttempts", -1)
        # Infinite retries until the source expires
        if maximum_retry_attempts == -1:
            return False
        return attempts > maximum_retry_attempts
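
    # Example: with MaximumRetryAttempts == 2, max_retries_exceeded returns False for attempts 1
    # and 2, and True for attempt 3, i.e., one initial execution plus two retries before the batch
    # is handed to the failure destination; -1 (the default) retries until the records expire at
    # the source.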

    def bisect_events(
        self, sequence_number: str, events: list[dict]
    ) -> tuple[list[dict], list[dict]]:
        """Splits the list of events in two at the event whose sequence number equals the passed `sequence_number`.
        This is used for:
          - `ReportBatchItemFailures`: discarding already-processed events that precede the first failed record, when this option is set.
          - `BisectBatchOnFunctionError`: splitting a failed batch in two when doing a retry (not implemented)."""
        for i, event in enumerate(events):
            if self.get_sequence_number(event) == sequence_number:
                return events[:i], events[i:]

        return events, []
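
    # Example: for events with sequence numbers ["101", "102", "103"] and sequence_number == "102",
    # bisect_events returns (["101"], ["102", "103"]); if no event matches the given sequence
    # number, it returns (events, []).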


def get_failure_s3_object_key(esm_uuid: str, shard_id: str, failure_datetime: datetime) -> str:
    """
    From https://docs.aws.amazon.com/lambda/latest/dg/kinesis-on-failure-destination.html:

    The S3 object containing the invocation record uses the following naming convention:
    aws/lambda/<ESM-UUID>/<shardID>/YYYY/MM/DD/YYYY-MM-DDTHH.MM.SS-<Random UUID>

    :return: Key for the S3 object that the invocation failure record will be put to
    """
    timestamp = failure_datetime.strftime("%Y-%m-%dT%H.%M.%S")
    year_month_day = failure_datetime.strftime("%Y/%m/%d")
    random_uuid = long_uid()
    return f"aws/lambda/{esm_uuid}/{shard_id}/{year_month_day}/{timestamp}-{random_uuid}"
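
# Example key (hypothetical UUID and timestamp): a failure at 2025-03-24 20:24:15 UTC on shard
# "shardId-000000000000" would produce something like:
#   aws/lambda/8d1e43a2-0f3c-4a8e-9b1a-5f6d7c8e9a0b/shardId-000000000000/2025/03/24/2025-03-24T20.24.15-<random UUID>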