refined-lunch-94030
05/10/2023, 9:59 AM
Traceback (most recent call last):
File "/usr/local/lib/python3.10/site-packages/urllib3/connection.py", line 174, in _new_conn
conn = connection.create_connection(
File "/usr/local/lib/python3.10/site-packages/urllib3/util/connection.py", line 72, in create_connection
for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
File "/usr/local/lib/python3.10/socket.py", line 955, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno -3] Temporary failure in name resolution
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/site-packages/botocore/httpsession.py", line 465, in send
urllib_response = conn.urlopen(
File "/usr/local/lib/python3.10/site-packages/urllib3/connectionpool.py", line 787, in urlopen
retries = retries.increment(
File "/usr/local/lib/python3.10/site-packages/urllib3/util/retry.py", line 525, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/usr/local/lib/python3.10/site-packages/urllib3/packages/six.py", line 770, in reraise
raise value
File "/usr/local/lib/python3.10/site-packages/urllib3/connectionpool.py", line 703, in urlopen
httplib_response = self._make_request(
File "/usr/local/lib/python3.10/site-packages/urllib3/connectionpool.py", line 386, in _make_request
self._validate_conn(conn)
File "/usr/local/lib/python3.10/site-packages/urllib3/connectionpool.py", line 1042, in _validate_conn
conn.connect()
File "/usr/local/lib/python3.10/site-packages/urllib3/connection.py", line 363, in connect
self.sock = conn = self._new_conn()
File "/usr/local/lib/python3.10/site-packages/urllib3/connection.py", line 186, in _new_conn
raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <botocore.awsrequest.AWSHTTPSConnection object at 0x7f6941b61150>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/site-packages/flytekit/core/data_persistence.py", line 301, in get_data
self.get(remote_path, to_path=local_path, recursive=is_multipart)
File "/usr/local/lib/python3.10/site-packages/flytekit/core/data_persistence.py", line 199, in get
return file_system.get(from_path, to_path, recursive=recursive)
File "/usr/local/lib/python3.10/site-packages/fsspec/spec.py", line 893, in get
rpaths = [p for p in rpaths if not (trailing_sep(p) or self.isdir(p))]
File "/usr/local/lib/python3.10/site-packages/fsspec/spec.py", line 893, in <listcomp>
rpaths = [p for p in rpaths if not (trailing_sep(p) or self.isdir(p))]
File "/usr/local/lib/python3.10/site-packages/s3fs/core.py", line 601, in isdir
return bool(self._lsdir(path))
File "/usr/local/lib/python3.10/site-packages/s3fs/core.py", line 394, in _lsdir
for i in it:
File "/usr/local/lib/python3.10/site-packages/botocore/paginate.py", line 269, in __iter__
response = self._make_request(current_kwargs)
File "/usr/local/lib/python3.10/site-packages/botocore/paginate.py", line 357, in _make_request
return self._method(**current_kwargs)
File "/usr/local/lib/python3.10/site-packages/botocore/client.py", line 530, in _api_call
return self._make_api_call(operation_name, kwargs)
File "/usr/local/lib/python3.10/site-packages/botocore/client.py", line 943, in _make_api_call
http, parsed_response = self._make_request(
File "/usr/local/lib/python3.10/site-packages/botocore/client.py", line 966, in _make_request
return self._endpoint.make_request(operation_model, request_dict)
File "/usr/local/lib/python3.10/site-packages/botocore/endpoint.py", line 119, in make_request
return self._send_request(request_dict, operation_model)
File "/usr/local/lib/python3.10/site-packages/botocore/endpoint.py", line 202, in _send_request
while self._needs_retry(
File "/usr/local/lib/python3.10/site-packages/botocore/endpoint.py", line 354, in _needs_retry
responses = self._event_emitter.emit(
File "/usr/local/lib/python3.10/site-packages/botocore/hooks.py", line 412, in emit
return self._emitter.emit(aliased_event_name, **kwargs)
File "/usr/local/lib/python3.10/site-packages/botocore/hooks.py", line 256, in emit
return self._emit(event_name, kwargs)
File "/usr/local/lib/python3.10/site-packages/botocore/hooks.py", line 239, in _emit
response = handler(**kwargs)
File "/usr/local/lib/python3.10/site-packages/botocore/retryhandler.py", line 207, in __call__
if self._checker(**checker_kwargs):
File "/usr/local/lib/python3.10/site-packages/botocore/retryhandler.py", line 284, in __call__
should_retry = self._should_retry(
File "/usr/local/lib/python3.10/site-packages/botocore/retryhandler.py", line 320, in _should_retry
return self._checker(attempt_number, response, caught_exception)
File "/usr/local/lib/python3.10/site-packages/botocore/retryhandler.py", line 363, in __call__
checker_response = checker(
File "/usr/local/lib/python3.10/site-packages/botocore/retryhandler.py", line 247, in __call__
return self._check_caught_exception(
File "/usr/local/lib/python3.10/site-packages/botocore/retryhandler.py", line 416, in _check_caught_exception
raise caught_exception
File "/usr/local/lib/python3.10/site-packages/botocore/endpoint.py", line 281, in _do_get_response
http_response = self._send(request)
File "/usr/local/lib/python3.10/site-packages/botocore/endpoint.py", line 377, in _send
return self.http_session.send(request)
File "/usr/local/lib/python3.10/site-packages/botocore/httpsession.py", line 494, in send
raise EndpointConnectionError(endpoint_url=request.url, error=e)
botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "<https://flyte-exaris.s3.amazonaws.com/?list-type=2&prefix=exaris-dev%2Fdevelopment%2FAFHNPRZZHP5G2Z6PQZPJZ5BTQI%3D%3D%3D%3D%3D%3D%2Ffast0d58d290717094dfcabe1d0e8ccbfb63.tar.gz%2F&delimiter=%2F&encoding-type=url>"
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/bin/pyflyte-fast-execute", line 8, in <module>
sys.exit(fast_execute_task_cmd())
File "/usr/local/lib/python3.10/site-packages/click/core.py", line 1130, in __call__
return self.main(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/click/core.py", line 1055, in main
rv = self.invoke(ctx)
File "/usr/local/lib/python3.10/site-packages/click/core.py", line 1404, in invoke
return ctx.invoke(self.callback, **ctx.params)
File "/usr/local/lib/python3.10/site-packages/click/core.py", line 760, in invoke
return __callback(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/flytekit/bin/entrypoint.py", line 497, in fast_execute_task_cmd
_download_distribution(additional_distribution, dest_dir)
File "/usr/local/lib/python3.10/site-packages/flytekit/tools/fast_registration.py", line 111, in download_distribution
FlyteContextManager.current_context().file_access.get_data(additional_distribution, os.path.join(destination, ""))
File "/usr/local/lib/python3.10/site-packages/flytekit/core/data_persistence.py", line 303, in get_data
raise FlyteAssertion(
flytekit.exceptions.user.FlyteAssertion: Failed to get data from <s3://flyte-exaris/exaris-dev/development/AFHNPRZZHP5G2Z6PQZPJZ5BTQI======/fast0d58d290717094dfcabe1d0e8ccbfb63.tar.gz> to /root/ (recursive=False).
Original exception: Could not connect to the endpoint URL: "<https://flyte-exaris.s3.amazonaws.com/?list-type=2&prefix=exaris-dev%2Fdevelopment%2FAFHNPRZZHP5G2Z6PQZPJZ5BTQI%3D%3D%3D%3D%3D%3D%2Ffast0d58d290717094dfcabe1d0e8ccbfb63.tar.gz%2F&delimiter=%2F&encoding-type=url>"
What is really weird is that many tasks have no problem reaching S3, but some do. All task pods run in the same SG, subnet, and IAM role.
This happens with larger numbers of tasks, but also when fewer tasks run in parallel and there are therefore fewer calls to S3.
CoreDNS pods are not failing.
I have already been hunting for the source of the fault for a couple of hours; maybe somebody has a hint or an idea for me.
freezing-airport-6809
freezing-airport-6809
freezing-airport-6809
freezing-airport-6809