I have an app that is pulling data to populate the app from a Box account using an API request. Each call is meant to download the content of a json (usually about ~1MB in size). When I run the app locally the call back with this API request takes about 2-3 seconds but when I deploy the app on Heroku these times can be anywhere from 5 seconds to many minutes. I am attempting to solve this issue by chunking the download and having the chunks download in parallel threads using asyncio. This implementation currently runs, but I don’t believe this is the correct way to implement the asynchronous functionality. Anyone have experience with this?
@app.callback(
[Output('raw-data-store', 'data', allow_duplicate=True),
Output("session-timeout-modal", "is_open", allow_duplicate=True)],
[Input(f'patient-id-input_{cancer_type}_{task}', 'value'),
Input('authenticated', 'data')],
[State('box_access_token', 'data'),
State('box_refresh_token', 'data'),
State('box_token_expiration', 'data')],
prevent_initial_call=True
)
def get_data(patient_id, authenticated, access_token, refresh_token, token_exp_time):
start_time = datetime.now()
if patient_id:
if access_token and refresh_token and datetime.now() < datetime.strptime(token_exp_time, "%Y-%m-%d %H:%M:%S.%f") and authenticated:
# Create a Box client with the obtained access token
client = boxsdk.Client(boxsdk.OAuth2(
client_id=client_id,
client_secret=client_secret,
access_token=access_token,
refresh_token=refresh_token
))
# Get file with data
with open(f'pages/{cancer_type}/patient_file_ids_{cancer_type}.json', 'r') as json_file:
file_ids = json.load(json_file)
file_id = file_ids[str(patient_id)]
if file_id is None:
return None, no_update
# Define chunk size and number of chunks
chunk_size = 100000
total_file_size = client.file(file_id).get().size
num_chunks = total_file_size // chunk_size + 1
# Create tasks for downloading each chunk asynchronously
async def download_all_chunks():
# Function to get content of a specific file by file_id
async def get_file_content_async(box_client, file_id, start_byte, end_byte):
# Get the file content as bytes for the specified range
def get_content(start_byte, end_byte):
return box_client.file(file_id).content(byte_range=(start_byte, end_byte))
file_content_bytes_coroutine = asyncio.to_thread(get_content, start_byte, end_byte)
file_content_bytes = await file_content_bytes_coroutine
return file_content_bytes
tasks = [
get_file_content_async(client, file_id, i * chunk_size,
min((i + 1) * chunk_size - 1, total_file_size - 1)) for i in range(num_chunks)
]
return await asyncio.gather(*tasks)
file_data_bytes = asyncio.run(download_all_chunks())
combined_file_data = bytes().join(file_data_bytes)
combined_file_data = combined_file_data.decode(chardet.detect(combined_file_data)['encoding'])
# get file content ???
file_content = json.loads(combined_file_data)
end_time = datetime.now()
time_difference = end_time - start_time
difference_in_seconds = time_difference.total_seconds()
print(f"Chunk size: {chunk_size}, Run time: {difference_in_seconds} seconds")
return file_content, no_update
else:
os.unsetenv('ACCESS_TOKEN')
os.unsetenv('REFRESH_TOKEN')
os.unsetenv('TOKEN_EXP_TIME')
return None, True
return None, False