import os
import glob
from datetime import datetime, timedelta
import numpy as np
from netCDF4 import Dataset
#############################
'''
This module is used to import model output files.
List of functions:
* build_file_list: build a list of NetCDF file paths between two dates.
* import_4D: import a 3D field as a time series (4D output).
* import_3D: import a 2D field as a time series (3D output).
* import_layer: import one layer of a 3D field as a time series (3D output).
* import_surface: import the surface layer of a 3D field as a time series (3D output).
* import_depth: import data at a specified depth from a 3D field as a time series (3D output).
* find_nearest_index_haversine: find the grid point nearest to a given latitude/longitude.
* import_point: import a single grid point as a time series (1D output).
* import_profile: import a vertical profile at a single grid point as a time series (2D output).
Features:
* Automatically loads the grid variables matching the variable suffix (_u, _v, _w, ...).
* For multiple files, it checks that all files are available before loading any data, to avoid a crash in the middle.
* You can choose to stop the script if any file is missing, or to fill the data for that date with NaN values.
'''
#############################
def _missing_not_allowed(ignore_missing):
    """Tell whether missing input files must be treated as an error.

    Parameters
    ----------
    ignore_missing : bool or str
        User flag. The boolean ``False`` and any value whose string form
        equals ``'false'`` (case-insensitive) mean "do not ignore".

    Returns
    -------
    bool
        True when missing files are not allowed, False otherwise.
    """
    if ignore_missing is False:
        return True
    flag_text = str(ignore_missing).lower()
    return flag_text == 'false'
def _raise_if_missing(file_list):
    """Raise if the file list contains a placeholder for a missing file.

    Parameters
    ----------
    file_list : list of str
        File paths in which an empty string ``""`` marks a missing file.

    Raises
    ------
    FileNotFoundError
        If at least one entry is the empty string.
    """
    has_gap = "" in file_list
    if not has_gap:
        return
    message = "Missing input files. Set ignore_missing=True to fill missing dates with NaN."
    raise FileNotFoundError(message)
# [docs]
def build_file_list(path, tstart, tend):
    """
    Build a list of NetCDF file paths between two dates.

    The file-name pattern (text before and after the ``YYYYMMDD`` date) is
    taken from the file found for ``tstart``; the same pattern is then
    applied to every following day.

    Parameters
    ----------
    path : str
        Directory containing the NetCDF files.
    tstart : datetime
        Start date (inclusive).
    tend : datetime
        End date (inclusive).

    Returns
    -------
    list of str
        A list of file paths, one entry per day between tstart and tend.
        - If a file exists, the full path is included.
        - If a file is missing, the entry is an empty string "".

    Raises
    ------
    FileNotFoundError
        If the first file cannot be found for tstart.
    RuntimeError
        If the filename does not contain the expected date string.
    """
    date_str = tstart.strftime("%Y%m%d")

    # Step 1: locate the first file to extract the filename pattern.
    # The directory part is escaped so that characters such as '[' or '*'
    # in `path` are matched literally instead of being read as wildcards.
    first_pattern = os.path.join(glob.escape(os.fspath(path)), date_str + "*")
    # glob returns matches in arbitrary order; sort so that the same file is
    # always chosen when several files share the start date.
    hits = sorted(glob.glob(first_pattern))
    if not hits:
        raise FileNotFoundError(f"No file found for {date_str} in {path}")

    # Step 2: extract prefix and suffix from the first file
    first_file = os.path.basename(hits[0])
    prefix, found, suffix = first_file.partition(date_str)
    if not found:
        raise RuntimeError(f"Cannot split date string {date_str} from filename {first_file}")

    # Step 3: generate the file list for each day
    flist = []
    t = tstart
    one_day = timedelta(days=1)
    while t <= tend:
        fpath = os.path.join(path, prefix + t.strftime("%Y%m%d") + suffix)
        if os.path.exists(fpath):
            flist.append(fpath)
        else:
            print(f"Missing file for {t.strftime('%Y-%m-%d')}: {fpath}")
            # Keep the position consistent with the date
            flist.append("")
        t += one_day
    return flist
#############################
# [docs]
def import_4D(path, var, tstart, tend, ignore_missing='False'):
    """
    Import a 4D variable from a sequence of daily NetCDF files.

    Parameters
    ----------
    path : str
        Directory containing the NetCDF files and the grid file 'grid.nc'.
    var : str
        Variable name to read from each file (e.g., 'veloc_u_t'). Its last
        character selects the grid variable 'depth_<suffix>'; 'depth_t' is
        used when that variable does not exist.
    tstart : datetime
        Start date (inclusive).
    tend : datetime
        End date (inclusive).
    ignore_missing : str or bool, optional
        If 'False' / False (default), an error is raised when a file is missing.
        If 'True' / True, missing days are allowed and filled with NaN.

    Returns
    -------
    numpy.ndarray
        A 4D array with shape (ntime, nz, ny, nx), dtype float64.
        Masked values and missing files are represented with NaN values.

    Raises
    ------
    FileNotFoundError
        If the file for tstart is missing, or if any file is missing while
        missing files are not allowed.
    """
    duration = tend - tstart

    # One entry per day; missing files are represented as empty strings "".
    file_list = build_file_list(path, tstart, tend)

    # Check the whole period up front to avoid failing in the middle of the import.
    if _missing_not_allowed(ignore_missing):
        _raise_if_missing(file_list)

    # The grid file is only used to determine the output dimensions.
    grid = os.path.join(path, 'grid.nc')
    with Dataset(grid, 'r') as fgrid:
        try:
            depth_t = fgrid.variables['depth_%s' % (var[-1])][:]
        except KeyError:
            print('Could not find a grid suffix for %s. Using _t as default.' % (var))
            depth_t = fgrid.variables['depth_t'][:]

    # Output array, shape: [time, depth_z, depth_y, depth_x]
    print('Processing path: %s at %s' % (path, datetime.now()))
    nz, ny, nx = (np.size(depth_t, axis) for axis in range(3))
    data_array = np.zeros((duration.days + 1, nz, ny, nx), dtype='float64')

    for i, fpath in enumerate(file_list):
        tnow = tstart + timedelta(days=i)

        if not fpath:
            # Missing file: the whole time step becomes NaN
            print('File not found for:', str(tnow), 'Missing values will be filled with NaN')
            data_array[i, :, :, :] = np.nan
            continue

        # Progress indicator: print the filename on the first day of each month
        if tnow.day == 1:
            print(fpath)

        with Dataset(fpath, 'r') as file1:
            # squeeze drops the length-1 time axis; masked cells become NaN
            data_array[i, :, :, :] = np.ma.filled(
                np.squeeze(file1.variables[var][:, :, :, :]), np.nan
            )

    print('Import completed.')
    return data_array
#############################
# [docs]
def import_3D(path, var, tstart, tend, ignore_missing='False'):
    """
    Import a 3D variable (time, y, x) from a sequence of daily NetCDF files.

    Parameters
    ----------
    path : str
        Directory containing the NetCDF files and the grid file 'grid.nc'.
    var : str
        Variable name to read from each file. Its last character selects the
        grid variable 'depth_<suffix>'; 'depth_t' is used when that variable
        does not exist.
    tstart : datetime
        Start date (inclusive).
    tend : datetime
        End date (inclusive).
    ignore_missing : str or bool, optional
        If 'False' / False (default), an error is raised when a file is missing.
        If 'True' / True, missing days are allowed and filled with NaN.

    Returns
    -------
    numpy.ndarray
        A 3D array with shape (ntime, ny, nx), dtype float64.
        Masked values and missing files are represented with NaN values.

    Raises
    ------
    FileNotFoundError
        If the file for tstart is missing, or if any file is missing while
        missing files are not allowed.
    """
    duration = tend - tstart

    # One entry per day; missing files are represented as empty strings "".
    file_list = build_file_list(path, tstart, tend)

    # Check the whole period up front to avoid failing in the middle of the import.
    if _missing_not_allowed(ignore_missing):
        _raise_if_missing(file_list)

    # The grid file is only used to determine the horizontal dimensions.
    grid = os.path.join(path, 'grid.nc')
    with Dataset(grid, 'r') as fgrid:
        try:
            depth_t = fgrid.variables['depth_%s' % (var[-1])][:]
        except KeyError:
            print('Could not find a grid suffix for %s. Using _t as default.' % (var))
            depth_t = fgrid.variables['depth_t'][:]

    # Output array, shape: [time, depth_y, depth_x]
    print('Processing path: %s at %s' % (path, datetime.now()))
    ny, nx = np.size(depth_t, 1), np.size(depth_t, 2)
    data_array = np.zeros((duration.days + 1, ny, nx), dtype='float64')

    for i, fpath in enumerate(file_list):
        tnow = tstart + timedelta(days=i)

        if not fpath:
            # Missing file: the whole time step becomes NaN
            print('File not found for:', str(tnow), 'Missing values will be filled with NaN')
            data_array[i, :, :] = np.nan
            continue

        # Progress indicator: print the filename on the first day of each month
        if tnow.day == 1:
            print(fpath)

        with Dataset(fpath, 'r') as file1:
            # squeeze drops the length-1 time axis; masked cells become NaN
            data_array[i, :, :] = np.ma.filled(
                np.squeeze(file1.variables[var][:, :, :]), np.nan
            )

    print('Import completed.')
    return data_array
#############################
# [docs]
def import_surface(path, var, tstart, tend, ignore_missing='False'):
    """
    Import the surface layer of a variable from a sequence of daily NetCDF files.

    The surface is taken as the last index along the vertical axis of the
    (time, z, y, x) variable.

    Parameters
    ----------
    path : str
        Directory containing the NetCDF files and the grid file 'grid.nc'.
    var : str
        Variable name to read from each file (e.g., 'veloc_u_t'). Its last
        character selects the grid variable 'depth_<suffix>'; 'depth_t' is
        used when that variable does not exist.
    tstart : datetime
        Start date (inclusive).
    tend : datetime
        End date (inclusive).
    ignore_missing : str or bool, optional
        If 'False' / False (default), an error is raised when a file is missing.
        If 'True' / True, missing days are allowed and filled with NaN.

    Returns
    -------
    numpy.ndarray
        A 3D array with shape (ntime, ny, nx), dtype float64.
        Masked values and missing files are represented with NaN values.

    Raises
    ------
    FileNotFoundError
        If the file for tstart is missing, or if any file is missing while
        missing files are not allowed.
    """
    duration = tend - tstart

    # One entry per day; missing files are represented as empty strings "".
    file_list = build_file_list(path, tstart, tend)

    # Check the whole period up front to avoid failing in the middle of the import.
    if _missing_not_allowed(ignore_missing):
        _raise_if_missing(file_list)

    # The grid file is only used to determine the horizontal dimensions.
    grid = os.path.join(path, 'grid.nc')
    with Dataset(grid, 'r') as fgrid:
        try:
            depth_t = fgrid.variables['depth_%s' % (var[-1])][:]
        except KeyError:
            print('Could not find a grid suffix for %s. Using _t as default.' % (var))
            depth_t = fgrid.variables['depth_t'][:]

    # Output array, shape: [time, depth_y, depth_x]
    print('Processing path: %s at %s' % (path, datetime.now()))
    ny, nx = np.size(depth_t, 1), np.size(depth_t, 2)
    data_array = np.zeros((duration.days + 1, ny, nx), dtype='float64')

    for i, fpath in enumerate(file_list):
        tnow = tstart + timedelta(days=i)

        if not fpath:
            # Missing file: the whole time step becomes NaN
            print('File not found for:', str(tnow), 'Missing values will be filled with NaN')
            data_array[i, :, :] = np.nan
            continue

        # Progress indicator: print the filename on the first day of each month
        if tnow.day == 1:
            print(fpath)

        with Dataset(fpath, 'r') as file1:
            # Read only the last vertical index; masked cells become NaN
            data_array[i, :, :] = np.ma.filled(
                np.squeeze(file1.variables[var][:, -1, :, :]), np.nan
            )

    print('Import completed.')
    return data_array
#############################
# [docs]
def import_layer(path, var, tstart, tend, layer, ignore_missing='False'):
    """
    Import one vertical layer of a variable from a sequence of daily NetCDF files.

    Parameters
    ----------
    path : str
        Directory containing the NetCDF files and the grid file 'grid.nc'.
    var : str
        Variable name to read from each file (e.g., 'veloc_u_t'). Its last
        character selects the grid variable 'depth_<suffix>'; 'depth_t' is
        used when that variable does not exist.
    tstart : datetime
        Start date (inclusive).
    tend : datetime
        End date (inclusive).
    layer : int
        Index of the layer to import along the vertical axis of the
        (time, z, y, x) variable.
    ignore_missing : str or bool, optional
        If 'False' / False (default), an error is raised when a file is missing.
        If 'True' / True, missing days are allowed and filled with NaN.

    Returns
    -------
    numpy.ndarray
        A 3D array with shape (ntime, ny, nx), dtype float64.
        Masked values and missing files are represented with NaN values.

    Raises
    ------
    FileNotFoundError
        If the file for tstart is missing, or if any file is missing while
        missing files are not allowed.
    """
    duration = tend - tstart

    # One entry per day; missing files are represented as empty strings "".
    file_list = build_file_list(path, tstart, tend)

    # Check the whole period up front to avoid failing in the middle of the import.
    if _missing_not_allowed(ignore_missing):
        _raise_if_missing(file_list)

    # The grid file is only used to determine the horizontal dimensions.
    grid = os.path.join(path, 'grid.nc')
    with Dataset(grid, 'r') as fgrid:
        try:
            depth_t = fgrid.variables['depth_%s' % (var[-1])][:]
        except KeyError:
            print('Could not find a grid suffix for %s. Using _t as default.' % (var))
            depth_t = fgrid.variables['depth_t'][:]

    # Output array, shape: [time, depth_y, depth_x]
    print('Processing path: %s at %s' % (path, datetime.now()))
    ny, nx = np.size(depth_t, 1), np.size(depth_t, 2)
    data_array = np.zeros((duration.days + 1, ny, nx), dtype='float64')

    for i, fpath in enumerate(file_list):
        tnow = tstart + timedelta(days=i)

        if not fpath:
            # Missing file: the whole time step becomes NaN
            print('File not found for:', str(tnow), 'Missing values will be filled with NaN')
            data_array[i, :, :] = np.nan
            continue

        # Progress indicator: print the filename on the first day of each month
        if tnow.day == 1:
            print(fpath)

        with Dataset(fpath, 'r') as file1:
            # Read only the requested layer; masked cells become NaN
            data_array[i, :, :] = np.ma.filled(
                np.squeeze(file1.variables[var][:, layer, :, :]), np.nan
            )

    print('Import completed.')
    return data_array
#############################
# [docs]
def import_depth(path, var, tstart, tend, depth, ignore_missing='False'):
    """
    Import a variable at a specified depth from daily NetCDF files.

    For every grid column the two layers that bracket the requested depth are
    located in the grid variable 'depth_<suffix>' and the field is linearly
    interpolated between them. A column whose layer depth equals the
    requested depth exactly takes the value of that layer.

    Parameters
    ----------
    path : str
        Directory containing the NetCDF files and the grid file 'grid.nc'.
    var : str
        Variable name to read from each file (e.g., 'veloc_u_t'). Its last
        character selects the grid variables 'depth_<suffix>' and
        'mask_<suffix>'; the '_t' variables are used when these do not exist.
    tstart : datetime
        Start date (inclusive).
    tend : datetime
        End date (inclusive).
    depth : float
        The depth to import. A positive value is converted to its negative,
        since the comparison with the grid depths is done on negative values.
    ignore_missing : str or bool, optional
        If 'False' / False (default), an error is raised when a file is missing.
        If 'True' / True, missing days are allowed and filled with NaN.

    Returns
    -------
    numpy.ndarray
        A 3D array with shape (ntime, ny, nx), dtype float64.
        Missing files, columns without usable layers, columns whose first
        vertical level is NaN and points where the grid mask is 0 are NaN.

    Raises
    ------
    FileNotFoundError
        If the file for tstart is missing, or if any file is missing while
        missing files are not allowed.
    ValueError
        If the grid mask is neither 2D nor 3D.
    """
    duration = tend - tstart

    # One entry per day; missing files are represented as empty strings "".
    file_list = build_file_list(path, tstart, tend)

    # Check the whole period up front to avoid failing in the middle of the import.
    if _missing_not_allowed(ignore_missing):
        _raise_if_missing(file_list)

    # Read layer depths and the land/sea mask from the grid file
    grid = os.path.join(path, 'grid.nc')
    with Dataset(grid, 'r') as fgrid:
        try:
            depth_t = fgrid.variables['depth_%s' % (var[-1])][:]
            mask_ = fgrid.variables['mask_%s' % (var[-1])][:]
        except KeyError:
            print('Could not find a grid suffix for %s. Using _t as default.' % (var))
            depth_t = fgrid.variables['depth_t'][:]
            mask_ = fgrid.variables['mask_t'][:]

    if mask_.ndim == 3:
        mask_t = mask_[0, :, :]
    elif mask_.ndim == 2:
        mask_t = np.copy(mask_)
    else:
        raise ValueError('Grid mask must be 2D or 3D, got %d dimensions.' % mask_.ndim)
    print(depth_t.shape, mask_t.shape)

    # Grid depths are compared as negative numbers
    if depth > 0:
        depth = depth * -1

    # Example: requested depth -5, neighbouring layers at -3 and -10.
    # max_array: index of the nearest layer at or below the target (-10)
    deeper = np.ma.masked_where(depth_t > depth, depth_t)
    max_array = np.argmax(deeper, axis=0)
    # min_array: index of the nearest layer at or above the target (-3)
    shallower = np.ma.masked_where(depth_t < depth, depth_t)
    min_array = np.argmin(shallower, axis=0)
    # NOTE(review): when a column has no candidate on one side (target outside
    # the column's depth range) the fully masked search appears to fall back to
    # index 0, so such a column may be combined from two non-adjacent layers
    # rather than set to NaN -- behaviour kept as in the original; confirm.

    # Depth of the selected layers for every column (masked depths -> NaN)
    depth_vals = np.ma.filled(np.ma.asarray(depth_t).astype('float64'), np.nan)
    idx_low = max_array[np.newaxis]
    idx_up = min_array[np.newaxis]
    d_low = np.take_along_axis(depth_vals, idx_low, axis=0)[0]
    d_up = np.take_along_axis(depth_vals, idx_up, axis=0)[0]

    # Two different layers were found -> linear interpolation between them
    bracketed = min_array != max_array
    # Same layer on both sides and it sits exactly at the target depth
    exact = ~bracketed & (d_up == depth)

    # Interpolation weights; with the example above: 1 - 5/7 and 1 - 2/7
    span = np.where(bracketed, d_low - d_up, 1.0)
    w_low = np.where(bracketed, 1 + (depth - d_low) / span, 0.0)
    w_up = np.where(bracketed, 1 - (depth - d_up) / span, exact.astype('float64'))

    multiply_array = np.zeros(depth_vals.shape, dtype='float64')
    np.put_along_axis(multiply_array, idx_low, w_low[np.newaxis], axis=0)
    # Written second so that, when both indices coincide, the exact-match
    # weight (1 or 0) is the one that remains.
    np.put_along_axis(multiply_array, idx_up, w_up[np.newaxis], axis=0)

    # Columns for which a value can be produced
    valid_column = bracketed | exact

    # Output array, shape: [time, depth_y, depth_x]
    print('Processing path: %s at %s' % (path, datetime.now()))
    data_array = np.zeros(
        (duration.days + 1, np.size(depth_t, 1), np.size(depth_t, 2)), dtype='float64'
    )

    for i, fpath in enumerate(file_list):
        tnow = tstart + timedelta(days=i)

        if not fpath:
            # Missing file: the whole time step becomes NaN
            print('File not found for:', str(tnow), 'Missing values will be filled with NaN')
            data_array[i, :, :] = np.nan
            continue

        # Progress indicator: print the filename on the first day of each month
        if tnow.day == 1:
            print(fpath)

        with Dataset(fpath, 'r') as file1:
            field = np.ma.filled(
                np.squeeze(file1.variables[var][:, :, :, :]), np.nan
            )

        # BE CAREFUL: nansum returns 0 when every term is NaN, hence the
        # explicit NaN masking below.
        at_depth = np.nansum(field * multiply_array, axis=0)
        at_depth[np.isnan(field[0, :, :])] = np.nan  # restore original NaN columns
        at_depth[~valid_column] = np.nan             # no usable layers
        at_depth[mask_t == 0] = np.nan               # land / sea mask
        data_array[i, :, :] = at_depth

    print('Import completed.')
    return data_array
#############################
# [docs]
def find_nearest_index_haversine(lat, lon, lat_p, lon_p):
    """
    Find the index (i, j) of the nearest grid point in 2D arrays `lat` and `lon`
    to a given target coordinate (lat_p, lon_p) using great-circle (Haversine) distance.

    Grid points whose coordinates are masked or NaN are never selected.

    Parameters
    ----------
    lat : np.ndarray
        2D array of latitude values (degrees).
    lon : np.ndarray
        2D array of longitude values (degrees).
    lat_p : float
        Target latitude (degrees).
    lon_p : float
        Target longitude (degrees).

    Returns
    -------
    tuple
        (i, j) index of the nearest grid point.
    """
    # Convert degrees to radians
    lat_rad = np.radians(lat)
    lon_rad = np.radians(lon)
    lat_p_rad = np.radians(lat_p)
    lon_p_rad = np.radians(lon_p)

    # Haversine formula
    dlat = lat_rad - lat_p_rad
    dlon = lon_rad - lon_p_rad
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat_p_rad) * np.cos(lat_rad) * np.sin(dlon / 2.0) ** 2
    # Rounding can push `a` marginally outside [0, 1] (near-antipodal points),
    # which would turn arcsin into NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    # Earth's radius (km) - absolute value of distance is enough for comparison
    dist = 6371.0 * c

    # argmin would return the first NaN entry, so invalid (NaN or masked)
    # distances are replaced by +inf before searching for the minimum.
    dist = np.ma.filled(np.ma.masked_invalid(dist), np.inf)

    # Find index of minimum distance
    idx_flat = np.argmin(dist)
    i, j = np.unravel_index(idx_flat, np.shape(lat))
    return i, j
# [docs]
def import_point(path, var, tstart, tend, lat_j, lon_i, ji = 'False', level = -1, ignore_missing='False'):
    """
    Import the time series of a single grid point from daily NetCDF files.

    Parameters
    ----------
    path : str
        Directory containing the NetCDF files and the grid file 'grid.nc'.
    var : str
        Variable name to read from each file (e.g., 'veloc_u_t'). Its last
        character selects the grid variables 'latitude_<suffix>' and
        'longitude_<suffix>'; the '_t' variables are used when these do not exist.
    tstart : datetime
        Start date (inclusive).
    tend : datetime
        End date (inclusive).
    lat_j : float or int
        Latitude, or j index when `ji` is true.
    lon_i : float or int
        Longitude, or i index when `ji` is true.
    ji : str or bool, optional
        If 'False' / False (default), the nearest grid point to the given
        latitude and longitude is used.
        If 'True' / True, `lat_j` and `lon_i` are used directly as indices.
    level : int, optional
        Vertical index to import. Only used for 3D fields. Default is -1.
    ignore_missing : str or bool, optional
        If 'False' / False (default), an error is raised when a file is missing.
        If 'True' / True, missing days are allowed and filled with NaN.

    Returns
    -------
    numpy.ndarray
        A 1D array with shape (ntime,), dtype float64.
        Masked values and missing files are represented with NaN values.

    Raises
    ------
    FileNotFoundError
        If the file for tstart is missing, or if any file is missing while
        missing files are not allowed.
    ValueError
        If the variable is neither 2D nor 3D once length-1 axes are dropped.
    """
    duration = tend - tstart

    # One entry per day; missing files are represented as empty strings "".
    file_list = build_file_list(path, tstart, tend)

    # Check the whole period up front to avoid failing in the middle of the import.
    if _missing_not_allowed(ignore_missing):
        _raise_if_missing(file_list)

    # Read the grid coordinates used to locate the point
    grid = os.path.join(path, 'grid.nc')
    with Dataset(grid, 'r') as fgrid:
        try:
            lat_t = fgrid.variables['latitude_%s' % (var[-1])][:]
            lon_t = fgrid.variables['longitude_%s' % (var[-1])][:]
        except KeyError:
            print('Could not find a grid suffix for %s. Using _t as default.' % (var))
            lat_t = fgrid.variables['latitude_t'][:]
            lon_t = fgrid.variables['longitude_t'][:]

    # Output array, shape: [time]
    print('Processing path: %s at %s' % (path, datetime.now()))
    data_array = np.zeros((duration.days + 1), dtype='float64')

    # The target point does not depend on the file, so resolve it once.
    # Accept both the string and the boolean form of the flag, in the same
    # way as ignore_missing.
    use_indices = str(ji).lower() == 'true'
    if use_indices:
        j_ind, i_ind = int(lat_j), int(lon_i)
    else:
        j_ind, i_ind = find_nearest_index_haversine(lat_t, lon_t, lat_j, lon_i)
    # The nearest-point report is printed once, and only after a lat/lon search
    location_reported = use_indices

    for i, fpath in enumerate(file_list):
        tnow = tstart + timedelta(days=i)

        if not fpath:
            # Missing file: the time step becomes NaN
            print('File not found for:', str(tnow), 'Missing values will be filled with NaN')
            data_array[i] = np.nan
            continue

        # Progress indicator: print the filename on the first day of each month
        if tnow.day == 1:
            print(fpath)

        with Dataset(fpath, 'r') as file1:
            nc_var = file1.variables[var]
            # Number of axes left after dropping length-1 axes, obtained from
            # the variable's shape without reading the whole field.
            data_dim = sum(1 for n in nc_var.shape if n != 1)

            if data_dim == 2:
                # Case 1: 2D field (time, y, x)
                if not location_reported:
                    print('Original location and nearest point location')
                    print('Lat', lat_j, lat_t[j_ind, i_ind])
                    print('Lon', lon_i, lon_t[j_ind, i_ind])
                    location_reported = True
                value = nc_var[:, j_ind, i_ind]
            elif data_dim == 3:
                # Case 2: 3D field (time, z, y, x)
                if not location_reported:
                    print('Lat', lat_j, 'Nearest point', lat_t[j_ind, i_ind])
                    print('Lon', lon_i, 'Nearest point', lon_t[j_ind, i_ind])
                    location_reported = True
                value = nc_var[:, level, j_ind, i_ind]
            else:
                raise ValueError(
                    'Data dimension = %d. Only 2D and 3D fields are supported.' % data_dim
                )

            # Masked values must become NaN instead of the raw fill value
            data_array[i] = np.ma.filled(
                np.ma.asarray(np.squeeze(value)).astype('float64'), np.nan
            )

    print('Import completed.')
    return data_array
#############################
# [docs]
def import_profile(path, var, tstart, tend, lat_j, lon_i, ji = 'False', ignore_missing='False'):
    """
    Import the vertical profile at a single grid point from daily NetCDF files.

    Parameters
    ----------
    path : str
        Directory containing the NetCDF files and the grid file 'grid.nc'.
    var : str
        Variable name to read from each file (e.g., 'veloc_u_t'). Its last
        character selects the grid variables 'latitude_<suffix>',
        'longitude_<suffix>' and 'depth_<suffix>'; the '_t' variables are
        used when these do not exist.
    tstart : datetime
        Start date (inclusive).
    tend : datetime
        End date (inclusive).
    lat_j : float or int
        Latitude, or j index when `ji` is true.
    lon_i : float or int
        Longitude, or i index when `ji` is true.
    ji : str or bool, optional
        If 'False' / False (default), the nearest grid point to the given
        latitude and longitude is used.
        If 'True' / True, `lat_j` and `lon_i` are used directly as indices.
    ignore_missing : str or bool, optional
        If 'False' / False (default), an error is raised when a file is missing.
        If 'True' / True, missing days are allowed and filled with NaN.

    Returns
    -------
    data_array : numpy.ndarray
        A 2D array with shape (ntime, nz), dtype float64.
        Masked values and missing files are represented with NaN values.
    index : numpy.ndarray
        Float array of length 2 holding the (j, i) indices of the grid point used.

    Raises
    ------
    FileNotFoundError
        If the file for tstart is missing, or if any file is missing while
        missing files are not allowed.
    ValueError
        If the variable is not 3D once length-1 axes are dropped.
    """
    duration = tend - tstart

    # One entry per day; missing files are represented as empty strings "".
    file_list = build_file_list(path, tstart, tend)

    # Check the whole period up front to avoid failing in the middle of the import.
    if _missing_not_allowed(ignore_missing):
        _raise_if_missing(file_list)

    # Read the grid coordinates (to locate the point) and depths (for nz)
    grid = os.path.join(path, 'grid.nc')
    with Dataset(grid, 'r') as fgrid:
        try:
            lat_t = fgrid.variables['latitude_%s' % (var[-1])][:]
            lon_t = fgrid.variables['longitude_%s' % (var[-1])][:]
            depth_t = fgrid.variables['depth_%s' % (var[-1])][:]
        except KeyError:
            print('Could not find a grid suffix for %s. Using _t as default.' % (var))
            lon_t = fgrid.variables['longitude_t'][:]
            lat_t = fgrid.variables['latitude_t'][:]
            depth_t = fgrid.variables['depth_t'][:]

    # Output array, shape: [time, depth_z]
    print('Processing path: %s at %s' % (path, datetime.now()))
    data_array = np.zeros((duration.days + 1, np.size(depth_t, 0)), dtype='float64')
    index = np.zeros((2))  # (j, i) of the grid point that was read

    # The target point does not depend on the file, so resolve it once.
    # Accept both the string and the boolean form of the flag, in the same
    # way as ignore_missing.
    use_indices = str(ji).lower() == 'true'
    if use_indices:
        j_ind, i_ind = int(lat_j), int(lon_i)
    else:
        j_ind, i_ind = find_nearest_index_haversine(lat_t, lon_t, lat_j, lon_i)
    # The nearest-point report is printed once, and only after a lat/lon search
    location_reported = use_indices

    for i, fpath in enumerate(file_list):
        tnow = tstart + timedelta(days=i)

        if not fpath:
            # Missing file: the whole profile becomes NaN
            print('File not found for:', str(tnow), 'Missing values will be filled with NaN')
            data_array[i] = np.nan
            continue

        # Progress indicator: print the filename on the first day of each month
        if tnow.day == 1:
            print(fpath)

        with Dataset(fpath, 'r') as file1:
            nc_var = file1.variables[var]
            # Number of axes left after dropping length-1 axes, obtained from
            # the variable's shape without reading the whole field.
            data_dim = sum(1 for n in nc_var.shape if n != 1)
            if data_dim != 3:
                # A profile needs a vertical axis: only (time, z, y, x) fields work
                raise ValueError('Data dimension = %d. Please check again...' % data_dim)

            if not location_reported:
                print('Original location and nearest point location')
                print('Lat', lat_j, lat_t[j_ind, i_ind])
                print('Lon', lon_i, lon_t[j_ind, i_ind])
                location_reported = True

            index[0] = j_ind
            index[1] = i_ind
            # Masked values must become NaN instead of the raw fill value
            data_array[i, :] = np.ma.filled(
                np.ma.asarray(np.squeeze(nc_var[:, :, j_ind, i_ind])).astype('float64'),
                np.nan,
            )

    print('Import completed.')
    return data_array, index