##--------------------##
import numpy as np  # numeric arrays (BUGFIX: was 'from matplotlib import numpy')


# ---------- READING BINARY MIROC GEOPHYSICAL DATA ----------
def CCM_readmodel(datafile, datadir):
    """
    Read one binary file of MIROC output (2D or 3D data).

    Input: directory and file name (following the nomenclature of the
    online repository for the HFC data).
    Output: Python dictionary with geophysical data + ancillary information.

    ---
    AUTHOR: Eric Dupuy, National Institute for Environmental Studies (NIES)
    ---
    NOTES:
    1) Error management is not implemented: please make sure the directory
       path does not end in '/' and that both the last directory on path and
       the filenames match those of the online repository, or modify
       accordingly.
    2) To simplify the multi-member analysis (100 per experiment), the
       output array dimension is standardized to 366 days; a dummy entry is
       inserted at index 59 (60th position) for non-leap years.
    3) Geophysical parameters implemented are those used in the analysis
       described in the JGR paper: atmospheric number density, atmospheric
       temperature, zonal mean zonal wind, ozone volume mixing ratio, total
       column ozone, meridional and vertical components and divergence of
       the Eliassen-Palm flux, and residual mean meridional and vertical
       wind (v*, w*).
    ---
    VERSION HISTORY:
    ver.1.0, last update: 2021-06-09 (ED, NIES) - adapted for external users
    ver.1.1 - review fixes: numpy import, leap-year output array,
              'xo3_P' dictionary key
    """
    # ==========
    # INPUT: Initialization
    # -----
    # extract info from names of data directory and file
    parameter = datadir.split('/')[-1]   # assumes no trailing '/' in path
    year = int(datafile.split('_')[-1])  # year is the last '_'-field of name

    # skip day index for non-leap years
    idx_dayskip = 59      # 29 February missing if not leap year
    numtot_records = 366  # number of "records" (= days) in the standardized
                          # output, accounting for leap years

    # -----
    # file properties and sizes for sequential binary reading
    fltprec = np.float32  # reading precision from binary
    hdr_bytes = 1024      # header size in bytes: 64 lines x 16 chars
    elt_bytes = 4         # length of 1 elt (data/padding) in bytes
    num_padding = 4       # number of padding elements / record (FORTRAN
                          # record delimiters)

    # header handling
    nchars_hline = 16     # number of characters per header line
    nlines_header = 64    # number of lines in the header

    # date indices (binary file header-related)
    idx_sttdate = 47      # field index of start date (00h00m) in header
    idx_middate = 26      # field index of middle date (12h00m) in header
    idx_enddate = 48      # field index of end date (00h00m next day)

    # -----
    # GEOLOCATION: model grid values
    # T42 resolution: constant longitude step 2.8125; latitude step ~2.8
    # (!! same order as the elements of the final data array !!)
    geolat = np.array([
        87.864, 85.097, 82.313, 79.526, 76.737, 73.948, 71.158, 68.368,
        65.578, 62.787, 59.997, 57.207, 54.416, 51.626, 48.835, 46.045,
        43.254, 40.464, 37.673, 34.883, 32.092, 29.301, 26.511, 23.720,
        20.930, 18.139, 15.348, 12.558, 9.767, 6.977, 4.186, 1.395,
        -1.395, -4.186, -6.977, -9.767, -12.558, -15.348, -18.139, -20.930,
        -23.720, -26.511, -29.301, -32.092, -34.883, -37.673, -40.464,
        -43.254, -46.045, -48.835, -51.626, -54.416, -57.207, -59.997,
        -62.787, -65.578, -68.368, -71.158, -73.948, -76.737, -79.526,
        -82.313, -85.097, -87.864,
    ], dtype=fltprec)  # mean step 2.789, min 2.767, max 2.791
    geolon = np.array(np.r_[:128] * 2.8125, dtype=fltprec)
    geolev = np.array([
        1000., 850., 700., 500., 400., 300., 250., 200., 170., 150., 130.,
        115., 100., 90., 80., 70., 50., 30., 20., 15., 10., 7., 5., 3., 2.,
        1.5, 1., 0.5, 0.3, 0.2, 0.1,
    ], dtype=fltprec)

    # dimensions of the geolocation data
    numtot_levels = 31  # total number of pressure levels
    numtot_lats = 64    # total number of lat elements
    numtot_lons = 128   # total number of lon elements

    # GEOPHYSICAL PARAMETER DEFAULTS: variable-related quantities:
    # full name, array dimensions, and physical unit (LaTeX string).
    # Available geophysical parameters are
    defgeo = {
        'air_P':  {'name': 'Air number density',
                   'fdims': '3d',
                   # NOTE(review): unit string looks like it is missing the
                   # '^' superscript marker — kept as in the original
                   'unit': 'm${-3}$'},
        'dobson': {'name': 'Total column ozone',
                   'fdims': '2d',
                   'unit': 'DU'},
        'epfdiv': {'name': 'Eliassen-Palm flux divergence',
                   'fdims': '2z',
                   'unit': 'm.s$^{-2}$'},
        'epfy':   {'name': 'Eliassen-Palm flux vector meridional component',
                   'fdims': '2z',
                   'unit': 'kg.m$^{-1}$.s$^{-2}$'},
        'epfz':   {'name': 'Eliassen-Palm flux vector vertical component',
                   'fdims': '2z',
                   'unit': 'kg.m$^{-1}$.s$^{-2}$'},
        'T_P':    {'name': 'Temperature',
                   'fdims': '3d',
                   'unit': 'K'},
        'u_P':    {'name': 'Zonal mean zonal wind',
                   'fdims': '3d',
                   'unit': 'm.s$^{-1}$'},
        'v_sta':  {'name': 'Residual mean meridional wind',
                   'fdims': '2z',
                   'unit': 'm.s$^{-1}$'},
        'w_sta':  {'name': 'Residual mean vertical wind',
                   'fdims': '2z',
                   'unit': 'm.s$^{-1}$'},
        # BUGFIX: key was 'lname', which broke defgeo[parameter]['name']
        'xo3_P':  {'name': 'Ozone volume mixing ratio',
                   'fdims': '3d',
                   'unit': 'ppv'},
    }

    # -----
    # Define shape of parameter array (= "dimensions"):
    # number of data elements and array sizes depending on the dimensions
    nlat = numtot_lats  # all latitudes are read
    dims = defgeo[parameter]['fdims']
    if dims == '3d':
        # 3D --> latitude/longitude/pressure, full output
        nelts = numtot_lats * numtot_lons * numtot_levels
        nlon = numtot_lons
        nlev = numtot_levels          # pressure levels (= 31)
        idx_levels = np.r_[:nlev]     # pressure indices (= [0: 30])
    elif dims == '2z':
        # 2z --> latitude/pressure (zonal data, e.g., EP flux divergence)
        nelts = numtot_lats * numtot_levels
        nlon = 1
        nlev = numtot_levels          # pressure levels (= 31)
        idx_levels = np.r_[:nlev]     # pressure indices (= [0: 30])
    elif dims == '2d':
        # 2D --> latitude/longitude (integrated data, total O3)
        nelts = numtot_lats * numtot_lons
        nlon = numtot_lons
        nlev = 1                      # surface or integrated
        idx_levels = np.r_[0]         # only one level

    # -----
    # number of bytes per record (== 1 day): header + data + padding
    rec_bytes = hdr_bytes + elt_bytes * (nelts + num_padding)

    # effective number of records in the file, accounting for leap years
    # (Gregorian rule: divisible by 4, except centuries not divisible by 400)
    is_leap = (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)
    nreff = numtot_records if is_leap else numtot_records - 1

    # -----
    # Initialize output dictionary
    data_out = {'name': {'infile': parameter,
                         'full': defgeo[parameter]['name']},
                'unit': defgeo[parameter]['unit'],
                'directory': datadir,
                'filename': datafile,
                'year': year,
                'datadims': {'type': dims,
                             'array': (numtot_records, nlev, nlat, nlon)},
                }

    # Initialize temporary containers
    dtsfull = []  # list of date dictionaries per record
    hdrfull = []  # list of headers (nlines_header strings of nchars_hline)
    # temporary data array, all values preset to NaN
    tmpdata = np.full((nreff, nlev, nlat, nlon), np.nan, dtype=fltprec)

    # dummy entries used to pad non-leap years: data, header, dates
    dummyrec = np.full((1, nlev, nlat, nlon), np.nan, dtype=fltprec)
    dummystr = ['-' * nchars_hline] * nlines_header
    dummydate = {'start': 'n/a', 'middle': 'n/a', 'end': 'n/a'}

    #####
    # READING: Extraction from file
    # Open the binary file for sequential reading, model binary output
    with open('/'.join([datadir, datafile]), 'rb') as fbinary:
        # loop over all records (= days)
        for irec in range(nreff):
            # set position at start of current record and read it whole
            fbinary.seek(rec_bytes * irec, 0)
            rec_bin = fbinary.read(rec_bytes)

            # extract header as a list of fixed-width strings
            # (skip the leading 4-byte FORTRAN record marker)
            hdr_strings = rec_bin[elt_bytes:hdr_bytes + elt_bytes].decode(
                "utf-8")
            hdrfull.append(
                [hdr_strings[pos:pos + nchars_hline]
                 for pos in range(0, nlines_header * nchars_hline,
                                  nchars_hline)])

            # date/time strings for this record (fields of the header)
            hdrrec = hdrfull[irec]
            dtsfull.append({'start': hdrrec[idx_sttdate],
                            'middle': hdrrec[idx_middate],
                            'end': hdrrec[idx_enddate]})

            # read remainder of the record = data
            # (convert native binary to output array: swap bytes and
            # rearrange; the padding bytes are due to the FORTRAN writing of
            # the model output; reading order differs from FORTRAN)
            offset = 3 * elt_bytes + hdr_bytes
            recdata = np.frombuffer(
                rec_bin[offset:offset + nelts * elt_bytes],
                dtype=fltprec).byteswap().reshape(nlev, nlat, nlon)

            # set missing values (-999.0) to NaN
            # (byteswap() returned a writable copy of the buffer)
            recdata[recdata == -999.] = np.nan

            # assign to temporary data array
            tmpdata[irec, :, :, :] = recdata[idx_levels, :, :]

    # -----
    # assign to final array (skip 29 Feb index except leap year)
    # indices 0-30: January; 31-58: Feb 01 to 28; 59: February 29 or NaN
    if nreff != numtot_records:
        # non-leap year: insert a dummy 29 February at index 59
        fulldata = np.concatenate((tmpdata[:idx_dayskip, :, :, :],
                                   dummyrec,
                                   tmpdata[idx_dayskip:, :, :, :]), axis=0)
        hdrfull = hdrfull[:idx_dayskip] + [dummystr] + hdrfull[idx_dayskip:]
        dtsfull = dtsfull[:idx_dayskip] + [dummydate] + dtsfull[idx_dayskip:]
    else:
        # BUGFIX: leap year — the file already holds 366 records;
        # 'fulldata' was previously left unassigned (NameError)
        fulldata = tmpdata

    #####
    # OUTPUT: Update output structure/dictionary
    data_out['dates'] = dtsfull    # dates
    data_out['headers'] = hdrfull  # headers
    data_out['data'] = fulldata    # data

    # return the output
    return data_out