# AWS client configuration for S3 in the Seoul region (ap-northeast-2).
# NOTE(review): credentials are hard-coded in source (values redacted here).
# Prefer environment variables, a shared credentials file / profile, or an
# IAM role so secrets never live in version control — TODO confirm the
# deployment environment supports one of those.
aws_credentials = {
"region_name": 'ap-northeast-2',
"aws_access_key_id": '******************',
"aws_secret_access_key": '******************',
}
# boto3 S3 client used by all uploads below.
s3 = boto3.client('s3', **aws_credentials)
# Export the whole collection to a newline-delimited JSON (NDJSON) temp file
# in fixed-size pages, then upload the finished file to S3 in one call.
total_documents = collection.count_documents({})
# Ceiling division: exactly enough skip/limit pages to cover every document.
# (The original `total // chunk_size + 1` issued one extra empty query
# whenever total was an exact multiple of chunk_size.)
num_chunks = (total_documents + chunk_size - 1) // chunk_size
table_name = db_name + "_" + collection_name
temp_file = "temp_file.json"
with open(temp_file, 'w', encoding='utf-8') as f:
    for i in range(num_chunks):
        skip = i * chunk_size
        # NOTE(review): skip/limit pagination re-scans all skipped documents
        # on every page (quadratic server work overall); a single find()
        # cursor or an _id-range filter scales better — confirm data volume.
        cursor = collection.find().skip(skip).limit(chunk_size)
        for document in cursor:
            json.dump(document, f, default=make_serializable)
            f.write('\n')  # one JSON document per line (was a literal '\\n')
# Bug fix: the original passed an undefined `file_name`; upload the temp
# file that was just written, after the `with` block has flushed/closed it.
s3.upload_file(temp_file, 'etl-cdc-s3', table_name)
직접 멀티파트 업로드 설명:
# Direct multipart upload, step by step:
#   1. create_multipart_upload initializes the multipart upload.
#   2. upload_part uploads the data as a series of numbered parts.
#   3. complete_multipart_upload finalizes the upload.
#   4. abort_multipart_upload cancels an in-progress upload.
def upload_chunk_to_s3(s3, bucket_name, key, chunk_data, part_number, mpu_id):
    """Upload one part of an S3 multipart upload and return its part record.

    Args:
        s3: boto3 S3 client.
        bucket_name: Target bucket name.
        key: Object key being assembled.
        chunk_data: Body of this part (str or bytes).
        part_number: 1-based part index (S3 allows 1..10000).
        mpu_id: UploadId returned by create_multipart_upload.

    Returns:
        dict with 'PartNumber' and 'ETag' — the exact shape
        complete_multipart_upload expects in MultipartUpload={'Parts': [...]}.
    """
    part = s3.upload_part(
        Body=chunk_data,
        Bucket=bucket_name,
        Key=key,
        PartNumber=part_number,
        UploadId=mpu_id,
    )
    return {'PartNumber': part_number, 'ETag': part['ETag']}
# Stream the whole collection into one S3 object via multipart upload.
# Every part except the last must be at least 5 MiB, so documents are
# buffered until the threshold is reached before each upload_part call.
mpu = s3.create_multipart_upload(Bucket=bucket_name, Key=key)
parts = []
part_number = 1
min_chunk_size = 5 * 1024 * 1024  # S3 minimum non-final part size: 5 MiB
# Buffer *bytes*, not str: S3 measures the encoded size, and len() of a str
# undercounts multi-byte UTF-8 characters (ensure_ascii=False keeps them).
buffer = bytearray()
try:
    for document in collection.find():
        line = json.dumps(document, default=make_serializable, ensure_ascii=False) + '\n'
        buffer += line.encode('utf-8')
        # Flush once the buffer meets the 5 MiB minimum part size.
        if len(buffer) >= min_chunk_size:
            part = upload_chunk_to_s3(s3, bucket_name, key, bytes(buffer), part_number, mpu['UploadId'])
            parts.append(part)
            buffer = bytearray()
            part_number += 1
    # The final part is allowed to be smaller than 5 MiB.
    if buffer:
        part = upload_chunk_to_s3(s3, bucket_name, key, bytes(buffer), part_number, mpu['UploadId'])
        parts.append(part)
    if parts:
        s3.complete_multipart_upload(
            Bucket=bucket_name,
            Key=key,
            UploadId=mpu['UploadId'],
            MultipartUpload={'Parts': parts},
        )
    else:
        # Empty collection: completing with zero parts is an API error,
        # so cancel the upload instead.
        s3.abort_multipart_upload(Bucket=bucket_name, Key=key, UploadId=mpu['UploadId'])
except Exception:
    # Abort so S3 does not retain (and bill for) orphaned uploaded parts.
    s3.abort_multipart_upload(Bucket=bucket_name, Key=key, UploadId=mpu['UploadId'])
    raise
An error occurred (EntityTooSmall) when calling the CompleteMultipartUpload operation: Your proposed upload is smaller than the minimum allowed size. This happens when AWS S3 checks the sizes of all the parts that were uploaded and finds that one or more of them are smaller than the minimum allowed size of 5 MB (except for the last part, which can be smaller).