import json

import boto3

aws_credentials = {
    "region_name": 'ap-northeast-2',
    "aws_access_key_id": '******************',
    "aws_secret_access_key": '******************',
}

s3 = boto3.client('s3', **aws_credentials)

S3 Large File Upload

First Attempt

total_documents = collection.count_documents({})
num_chunks = -(-total_documents // chunk_size)  # ceiling division, avoids a trailing empty chunk

table_name = db_name + "_" + collection_name
temp_file = "temp_file.json"

with open(temp_file, 'w') as f:
    for i in range(num_chunks):
        skip = i * chunk_size
        cursor = collection.find().skip(skip).limit(chunk_size)

        for document in cursor:
            json.dump(document, f, default=make_serializable)
            f.write('\n')

s3.upload_file(temp_file, 'etl-cdc-s3', table_name)

Second Attempt

Manual multipart upload, step by step:

  1. Initialize the multipart upload: call create_multipart_upload to start a multipart upload.
  2. Upload each part: use upload_part to upload the file split into multiple parts.
  3. Complete the upload: once every part is uploaded, call complete_multipart_upload to finish.
  4. Error handling: if an error occurs during the upload, call abort_multipart_upload to cancel it.

def upload_chunk_to_s3(s3, bucket_name, key, chunk_data, part_number, mpu_id):
    part = s3.upload_part(
        Body=chunk_data,
        Bucket=bucket_name,
        Key=key,
        PartNumber=part_number,
        UploadId=mpu_id,
    )
    return {'PartNumber': part_number, 'ETag': part['ETag']}

mpu = s3.create_multipart_upload(Bucket=bucket_name, Key=key)
parts = []

cursor = collection.find()
chunk_data = ''
part_number = 1
min_chunk_size = 5 * 1024 * 1024  # 5 MB in bytes

for document in cursor:
    chunk_data += json.dumps(document, default=make_serializable, ensure_ascii=False) + '\n'

    # When the buffer reaches the S3 multipart minimum part size (5 MB)
    if len(chunk_data) >= min_chunk_size:
        part = upload_chunk_to_s3(s3, bucket_name, key, chunk_data, part_number, mpu['UploadId'])
        parts.append(part)
        chunk_data = ''
        part_number += 1

# Upload remaining data
if chunk_data:
    part = upload_chunk_to_s3(s3, bucket_name, key, chunk_data, part_number, mpu['UploadId'])
    parts.append(part)

# Complete multipart upload
s3.complete_multipart_upload(
    Bucket=bucket_name,
    Key=key,
    UploadId=mpu['UploadId'],
    MultipartUpload={'Parts': parts}
)

An error occurred (EntityTooSmall) when calling the CompleteMultipartUpload operation: Your proposed upload is smaller than the minimum allowed size. This happens when AWS S3 checks the sizes of all the parts that were uploaded and finds that one or more of them are smaller than the minimum allowed size of 5 MB (except for the last part, which can be smaller).
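The constraint behind this error: every part except the last must be at least 5 MiB of encoded bytes. The loop above sizes its buffer with `len()` on a Python str, which counts characters; one way to make part sizing robust is to accumulate encoded bytes instead. A sketch under that assumption (the helper name `split_into_parts`, and the tiny `min_part` in the usage example, are mine):

```python
MIN_PART = 5 * 1024 * 1024  # S3 minimum for every part except the last (5 MiB)

def split_into_parts(lines, min_part=MIN_PART):
    """Group UTF-8-encoded lines into parts that each meet the minimum
    part size, except possibly the last one."""
    buf = bytearray()
    for line in lines:
        buf += line.encode('utf-8')
        if len(buf) >= min_part:  # measured in bytes, not characters
            yield bytes(buf)
            buf.clear()
    if buf:
        yield bytes(buf)  # final part may be smaller than min_part
```

Each yielded chunk can then be passed directly as the `Body` of `upload_part`.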

Third Attempt