To read a large ZIP file from an S3 bucket, split it into smaller ZIP files, and save those directly back to the S3 bucket, you can use the boto3 library to interact with AWS S3, the zipfile module to handle the ZIP archives, and io to create in-memory file-like objects. Note that this approach reads the entire archive into memory, so it is best suited to archives that fit comfortably in RAM. Here's an example:
First, make sure you have the boto3 library installed:
pip install boto3
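The example below passes credentials explicitly for clarity. If your environment already provides them (for example via aws configure, environment variables, or an IAM role), boto3 can resolve credentials through its default chain instead; a minimal sketch:

import boto3

# Credentials are resolved from boto3's default chain:
# environment variables, ~/.aws/credentials, or an attached IAM role
s3 = boto3.resource('s3')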
Source code
import boto3
import zipfile
import io
def split_zip_file(s3, bucket_name, zip_key, output_prefix, chunk_size_mb):
    chunk_size_bytes = chunk_size_mb * 1024 * 1024
    # Download the large ZIP file from the S3 bucket into memory
    obj = s3.Object(bucket_name, zip_key)
    with io.BytesIO(obj.get()['Body'].read()) as data:
        # Read the ZIP file
        with zipfile.ZipFile(data) as zip_ref:
            chunk_counter = 0
            current_chunk_size = 0
            current_zip_buffer = io.BytesIO()
            current_zip = zipfile.ZipFile(current_zip_buffer, 'w', zipfile.ZIP_DEFLATED)
            for file in zip_ref.filelist:
                # Sizes are uncompressed, so compressed chunks usually come in
                # under the limit; checking current_chunk_size > 0 avoids
                # flushing an empty chunk when a single file exceeds the limit
                if current_chunk_size > 0 and current_chunk_size + file.file_size > chunk_size_bytes:
                    # Save and close the current chunk
                    current_zip.close()
                    # Upload the chunk to S3
                    current_zip_buffer.seek(0)
                    s3.Object(bucket_name, f"{output_prefix}/chunk_{chunk_counter}.zip").put(Body=current_zip_buffer)
                    chunk_counter += 1
                    # Reset the current chunk
                    current_chunk_size = 0
                    current_zip_buffer = io.BytesIO()
                    current_zip = zipfile.ZipFile(current_zip_buffer, 'w', zipfile.ZIP_DEFLATED)
                # Add the file to the current chunk
                current_zip.writestr(file.filename, zip_ref.read(file.filename))
                current_chunk_size += file.file_size
            # Save and upload the last chunk
            current_zip.close()
            current_zip_buffer.seek(0)
            s3.Object(bucket_name, f"{output_prefix}/chunk_{chunk_counter}.zip").put(Body=current_zip_buffer)
# Configure the AWS credentials and client
AWS_ACCESS_KEY_ID = 'your_access_key_id'
AWS_SECRET_ACCESS_KEY = 'your_secret_access_key'
REGION_NAME = 'your_region_name'
s3 = boto3.resource(
    's3',
    region_name=REGION_NAME,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)
# Set the S3 bucket and object details
bucket_name = 'your_bucket_name'
zip_key = 'path/to/large_zip_file.zip'
output_prefix = 'path/to/output_folder'
# Call the split function
chunk_size_mb = 10 # Adjust the chunk size in MB as needed
split_zip_file(s3, bucket_name, zip_key, output_prefix, chunk_size_mb)
Replace the placeholders with your AWS access key, secret key, region, bucket name, and the key (path) of the large ZIP file in the S3 bucket. Adjust chunk_size_mb to set the target size for the smaller ZIP files; since the size check uses uncompressed file sizes, the compressed chunks will usually come in under that limit. The resulting files are saved as chunk_0.zip, chunk_1.zip, and so on under the specified output_prefix in the same bucket.
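To spot-check the result, you can list the chunk objects written under the output prefix; a minimal sketch reusing the s3 resource and variables from above:

# List the chunk files uploaded under the output prefix
bucket = s3.Bucket(bucket_name)
for obj in bucket.objects.filter(Prefix=output_prefix):
    print(obj.key, obj.size)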