quiet-manchester-70818
01/22/2025, 2:58 PM
Expected checksum 96ZInw== did not match calculated checksum: GBZQVA==
Is there any way to solve this issue? I'm posting the traceback in the reply.
quiet-manchester-70818
01/22/2025, 2:59 PMaverage-finland-92144
01/22/2025, 6:15 PM
boto3
to interact with S3 (or S3-compliant services like the MinIO instance that ships with the demo cluster), and this error seems to come from there.
Not sure if it's a limitation of the MinIO instance — for files that large, would it be better to move to flyte-binary and an S3 bucket?
average-finland-92144
01/22/2025, 6:17 PMfreezing-airport-6809
freezing-airport-6809
quiet-manchester-70818
01/23/2025, 8:23 AMquiet-manchester-70818
01/23/2025, 8:24 AMfreezing-airport-6809
fierce-monitor-77717
02/06/2025, 3:53 PMFlyteFile
if possiblefreezing-airport-6809
freezing-airport-6809
freezing-airport-6809
freezing-airport-6809
fierce-monitor-77717
02/09/2025, 7:36 AM
Trace:
Traceback (most recent call last):
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/flytekit/core/base_task.py", line 741, in dispatch_execute
native_inputs = self._literal_map_to_python_input(input_literal_map, exec_ctx)
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/flytekit/core/base_task.py", line 610, in _literal_map_to_python_input
return TypeEngine.literal_map_to_kwargs(ctx, literal_map, self.python_interface.inputs)
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/flytekit/core/utils.py", line 312, in wrapper
return func(*args, **kwargs)
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/flytekit/core/type_engine.py", line 1488, in literal_map_to_kwargs
return synced(ctx, lm, python_types, literal_types)
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/flytekit/utils/asyn.py", line 100, in wrapped
return self.run_sync(coro_func, *args, **kwargs)
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/flytekit/utils/asyn.py", line 93, in run_sync
return self._runner_map[name].run(coro)
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/flytekit/utils/asyn.py", line 72, in run
res = fut.result(None)
File "/opt/micromamba/envs/runtime/lib/python3.10/concurrent/futures/_base.py", line 458, in result
return self.__get_result()
File "/opt/micromamba/envs/runtime/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
raise self._exception
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/flytekit/core/type_engine.py", line 1526, in _literal_map_to_kwargs
await asyncio.gather(*kwargs.values())
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/flytekit/core/type_engine.py", line 1450, in async_to_python_value
pv = await transformer.async_to_python_value(ctx, lv, expected_python_type)
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/flytekit/types/structured/structured_dataset.py", line 1026, in async_to_python_value
return self.open_as(ctx, lv.scalar.structured_dataset, df_type=expected_python_type, updated_metadata=metad)
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/flytekit/types/structured/structured_dataset.py", line 1061, in open_as
result = decoder.decode(ctx, sd, updated_metadata)
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/flytekit/types/structured/basic_dfs.py", line 137, in decode
return pd.read_parquet(uri, columns=columns, storage_options=kwargs)
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/pandas/io/parquet.py", line 667, in read_parquet
return impl.read(
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/pandas/io/parquet.py", line 274, in read
pa_table = self.api.parquet.read_table(
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/pyarrow/parquet/core.py", line 1793, in read_table
dataset = ParquetDataset(
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/pyarrow/parquet/core.py", line 1371, in __init__
self._dataset = ds.dataset(path_or_paths, filesystem=filesystem,
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/pyarrow/dataset.py", line 794, in dataset
return _filesystem_dataset(source, **kwargs)
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/pyarrow/dataset.py", line 486, in _filesystem_dataset
return factory.finish(schema)
File "pyarrow/_dataset.pyx", line 3126, in pyarrow._dataset.DatasetFactory.finish
File "pyarrow/error.pxi", line 155, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow/error.pxi", line 89, in pyarrow.lib.check_status
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/fsspec/spec.py", line 2083, in read
out = self.cache._fetch(self.loc, self.loc + length)
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/fsspec/caching.py", line 249, in _fetch
self.cache = self.fetcher(start, end) # new block replaces old
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/s3fs/core.py", line 2359, in _fetch_range
return _fetch_range(
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/s3fs/core.py", line 2531, in _fetch_range
return sync(fs.loop, _inner_fetch, fs, bucket, key, version_id, start, end, req_kw)
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/fsspec/asyn.py", line 103, in sync
raise return_result
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/fsspec/asyn.py", line 56, in _runner
result[0] = await coro
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/s3fs/core.py", line 2549, in _inner_fetch
return await _error_wrapper(_call_and_read, retries=fs.retries)
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/s3fs/core.py", line 146, in _error_wrapper
raise err
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/s3fs/core.py", line 114, in _error_wrapper
return await func(*args, **kwargs)
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/s3fs/core.py", line 2545, in _call_and_read
return await resp["Body"].read()
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/aiobotocore/httpchecksum.py", line 58, in read
self._validate_checksum()
File "/opt/micromamba/envs/runtime/lib/python3.10/site-packages/aiobotocore/httpchecksum.py", line 67, in _validate_checksum
raise FlexibleChecksumError(error_msg=error_msg)
botocore.exceptions.FlexibleChecksumError: Expected checksum ++UuwQ== did not match calculated checksum: g1Ltrg==
Message:
FlexibleChecksumError: Expected checksum ++UuwQ== did not match calculated checksum: g1Ltrg==
All of the tasks are using the following container spec:
import os
from flytekit import ImageSpec
# Locate requirements.txt two directories above the directory of this module.
requirements_path = os.path.join(
    os.path.dirname(__file__),
    "..",
    "..",
    "requirements.txt",
)

# Container image definition used by all of the tasks: built from the
# requirements file above on Python 3.10 and pushed to the registry at
# localhost:30000.
default_image_spec = ImageSpec(
    registry="localhost:30000",
    python_version="3.10",
    requirements=requirements_path,
)
Here is my requirements file:
flytekit>=1.5.0
flytekitplugins-mlflow
flytekitplugins-deck-standard
numpy
pydantic<2.0.0
scikit-learn>=0.24.1,<= 1.6.1
snowflake-connector-python[pandas]
fastparquet>=2024.11.0
scikit-plot==0.3.7
matplotlib==3.9.2
seaborn==0.13.2
shap>=0.41.0
scipy==1.11.4
catboost==1.2.7
You can use the following code to generate the dataframe:
def random_string(length: int = 10) -> str:
    """Return a random string of ASCII letters and digits.

    Args:
        length: Number of characters to generate. ``0`` yields ``""``.

    Returns:
        A string of exactly ``length`` characters drawn (with replacement)
        from ``string.ascii_letters + string.digits``.
    """
    # `random` is fine here: the strings are filler test data, not secrets.
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))
@task(container_image=default_image_spec)
def generate_dataframe() -> pd.DataFrame:
    """Build a random 3000-row, 200-column DataFrame of mixed column types.

    Column ``i`` gets a type chosen by ``i % 5``:

    * 0 -> ``int_col_{i}``: random integers in ``[0, 1000)``
    * 1 -> ``float_col_{i}``: random floats in ``[0, 100)``
    * 2 -> ``bool_col_{i}``: random booleans
    * 3 -> ``datetime_col_{i}``: 2020-01-01 plus a random 0-3649 day offset
    * 4 -> ``string_col_{i}``: random 10-character alphanumeric strings

    Returns:
        The generated DataFrame. Values are unseeded, so every call differs.
    """
    num_rows = 3000
    num_cols = 200
    # Base date for the datetime columns; constant, so built once up front.
    start_date = datetime(2020, 1, 1)

    data = {}
    for i in range(num_cols):
        col_type = i % 5  # Cycle through different data types
        if col_type == 0:
            data[f"int_col_{i}"] = np.random.randint(0, 1000, num_rows)
        elif col_type == 1:
            data[f"float_col_{i}"] = np.random.rand(num_rows) * 100
        elif col_type == 2:
            data[f"bool_col_{i}"] = np.random.choice([True, False], num_rows)
        elif col_type == 3:
            data[f"datetime_col_{i}"] = [
                start_date + timedelta(days=np.random.randint(0, 3650))
                for _ in range(num_rows)
            ]
        elif col_type == 4:
            data[f"string_col_{i}"] = [random_string(10) for _ in range(num_rows)]

    df = pd.DataFrame(data)
    return df
Do you have any insights into why this error might be occurring?
fierce-monitor-77717
02/13/2025, 9:22 AMfreezing-airport-6809
damp-lion-88352
02/13/2025, 4:01 PMdamp-lion-88352
02/13/2025, 4:10 PMdamp-lion-88352
02/13/2025, 4:10 PMdamp-lion-88352
02/13/2025, 4:10 PMdamp-lion-88352
02/13/2025, 4:10 PM
import os
import random
import string
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from flytekit import ImageSpec, task
# Absolute path to the requirements file on the author's machine; anyone
# re-running this reproduction must point it at their own copy.
requirements_path = "/Users/future-outlier/code/dev/flytekit/build/projects/oss-image-checksum-error/requirements.txt"

# Container image used by the task below: built from the requirements file on
# Python 3.10 and pushed to the "futureoutlier" registry.
default_image_spec = ImageSpec(
    registry="futureoutlier",
    python_version="3.10",
    requirements=requirements_path,
)
def random_string(length: int = 10) -> str:
    """Return a random string of ASCII letters and digits.

    Args:
        length: Number of characters to generate. ``0`` yields ``""``.

    Returns:
        A string of exactly ``length`` characters drawn (with replacement)
        from ``string.ascii_letters + string.digits``.
    """
    # `random` is fine here: the strings are filler test data, not secrets.
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))
@task(container_image=default_image_spec)
def generate_dataframe() -> pd.DataFrame:
    """Build a random 3000-row, 200-column DataFrame of mixed column types.

    Column ``i`` gets a type chosen by ``i % 5``:

    * 0 -> ``int_col_{i}``: random integers in ``[0, 1000)``
    * 1 -> ``float_col_{i}``: random floats in ``[0, 100)``
    * 2 -> ``bool_col_{i}``: random booleans
    * 3 -> ``datetime_col_{i}``: 2020-01-01 plus a random 0-3649 day offset
    * 4 -> ``string_col_{i}``: random 10-character alphanumeric strings

    Returns:
        The generated DataFrame. Values are unseeded, so every call differs.
    """
    num_rows = 3000
    num_cols = 200
    # Base date for the datetime columns; constant, so built once up front.
    start_date = datetime(2020, 1, 1)

    data = {}
    for i in range(num_cols):
        col_type = i % 5  # Cycle through different data types
        if col_type == 0:
            data[f"int_col_{i}"] = np.random.randint(0, 1000, num_rows)
        elif col_type == 1:
            data[f"float_col_{i}"] = np.random.rand(num_rows) * 100
        elif col_type == 2:
            data[f"bool_col_{i}"] = np.random.choice([True, False], num_rows)
        elif col_type == 3:
            data[f"datetime_col_{i}"] = [
                start_date + timedelta(days=np.random.randint(0, 3650))
                for _ in range(num_rows)
            ]
        elif col_type == 4:
            data[f"string_col_{i}"] = [random_string(10) for _ in range(num_rows)]

    df = pd.DataFrame(data)
    return df
damp-lion-88352
02/13/2025, 4:11 PM
flytekit>=1.5.0
flytekitplugins-mlflow
flytekitplugins-deck-standard
numpy
pydantic<2.0.0
scikit-learn>=0.24.1,<= 1.6.1
snowflake-connector-python[pandas]
fastparquet>=2024.11.0
scikit-plot==0.3.7
matplotlib==3.9.2
seaborn==0.13.2
shap>=0.41.0
scipy==1.11.4
catboost==1.2.7
pandas
numpy
damp-lion-88352
02/13/2025, 4:11 PMdamp-lion-88352
02/13/2025, 4:20 PMfierce-monitor-77717
02/16/2025, 12:14 PMdamp-lion-88352
02/16/2025, 1:39 PMdamp-lion-88352
02/16/2025, 1:39 PMfierce-monitor-77717
02/16/2025, 3:21 PMdamp-lion-88352
02/16/2025, 3:21 PMdamp-lion-88352
02/16/2025, 3:21 PMdamp-lion-88352
02/16/2025, 3:21 PMquiet-manchester-70818
02/17/2025, 8:04 AM