'''Download and process all climate indices from NOAA Physical Sciences Laboratory
page: https://psl.noaa.gov/data/climateindices/list/

Author: guangzhi XU (xugzhi1987@gmail.com)
Update time: 2021-12-06 09:52:28.
'''

import os
import requests
import numpy as np
from datetime import datetime
from netCDF4 import Dataset, date2num
from bs4 import BeautifulSoup

# Index page that is scraped; its first HTML table lists the climate indices.
PAGE_URL = 'https://psl.noaa.gov/data/climateindices/list/'
# Root output folder (created at startup if missing); each index is saved in
# its own subfolder underneath.
OUTPUTDIR = '/home/guangzhi/datasets/climate_indices'
PLOT = True   # create time series plot or not


def fetchData(row, outputdir, timeout=60):
    '''Fetch data of a climate index, save to netCDF format

    The raw data file, a readme.txt with the description and the converted
    netCDF file (plus optional plots) are all written into a subfolder of
    <outputdir> named after the data file.

    Args:
        row (list): info about a climate index:
            [id, name, data_url, descriptions, urls_in_descriptions].
            <urls_in_descriptions> is a single str with one url per line.
        outputdir (str): folder to save outputs.
    Keyword Args:
        timeout (float): seconds to wait for the server before giving up.
    Returns:
        rec (int): 0 is run successfully, 1 if the download or the
            conversion to netCDF failed.
    '''
    # local import keeps this function self-contained
    from urllib.parse import urljoin

    _, nameii, urlii, descripii, desc_urlsii = row
    print('\n# <download_climate_indices>: Processing:', nameii)

    # urljoin copes with rooted ('/data/x'), relative ('data/x') and already
    # absolute hrefs, where plain concatenation would build a malformed url.
    absurlii = urljoin('https://psl.noaa.gov/', urlii)
    filenameii = os.path.split(urlii)[1]
    abbrii = os.path.splitext(filenameii)[0]
    abbrii = abbrii.replace('/', '-')
    abbrii = abbrii.replace('\\', '-')

    #--------------Get data in csv format--------------
    # Download first so that a failed request leaves nothing on disk, and
    # reject HTTP error pages instead of saving them as data.
    try:
        response = requests.get(absurlii, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('Failed to fetch data from url:', absurlii, '-', exc)
        return 1
    dataii = response.text
    print('Fetched data from url:', absurlii)

    #-------save each climate index in a separate subfolder---
    subfolderii = os.path.join(outputdir, abbrii)
    os.makedirs(subfolderii, exist_ok=True)

    abspathii_data = os.path.join(subfolderii, filenameii)
    print('Writing data to file:', abspathii_data)
    with open(abspathii_data, 'w') as fout:
        fout.write(dataii)

    # ----------------Write readme----------------
    descripii = descripii + '\n\nLinks:\n' + desc_urlsii
    abspathii_dsc = os.path.join(subfolderii, 'readme.txt')
    print('Writing readme to file:', abspathii_dsc)
    with open(abspathii_dsc, 'w') as fout:
        fout.write(descripii)

    # ------------------Convert to nc------------------
    # The raw data and readme are kept even if the conversion fails; report
    # the reason rather than swallowing it silently.
    try:
        convert2NC(abspathii_data, subfolderii, abbrii, isplot=PLOT)
    except Exception as exc:
        print('Failed to convert %s to netCDF: %r' % (abspathii_data, exc))
        return 1

    return 0


def readCSV(abpath_in):
    '''Read time series data in csv format

    Expected layout: one header row (ignored), then one row per year
    ("year v1 ... v12"), then a short row whose first token is the missing
    value, then free-form description lines.

    Args:
        abpath_in (str): absolute path to csv file.
    Returns:
        data (ndarray): 2D array of floats, shape (n_years, 12).
        years (ndarray): 1d array of ints, years.
        missing (float): missing value in <data>.
        desc (str): description texts at the end of the csv data file.
    Raises:
        ValueError: if the file contains no missing-value row (e.g. it is
            empty or truncated), or if a token cannot be parsed as a number.
    '''

    data = []
    years = []
    descr = []
    missing = None  # stays None until the end-of-data row is found
    with open(abpath_in, 'r') as fin:
        next(fin, None)  # skip 1st row
        for line in fin:
            if missing is not None:
                # everything after the missing-value row is description
                descr.append(line.strip())
                continue

            fields = line.split()
            if not fields:
                # tolerate blank lines inside the data section
                continue
            if len(fields) < 12:  # end of data
                missing = float(fields[0])
            else:
                years.append(int(fields[0]))
                data.append([float(vii) for vii in fields[1:]])

    if missing is None:
        raise ValueError(
            'No missing-value row found after the data in %s' % abpath_in)

    years = np.array(years).astype('int')
    data = np.array(data).astype('float')
    desc = '. '.join(descr)

    return data, years, missing, desc


def createMonthlyTimeax(t1, t2):
    '''Create monthly time axis from beginning to finishing time points (both included).

    The first element is <t1> itself; every following element is the first
    day of the next calendar month, up to and including <t2>.

    Args:
        t1, t2 (str): time strings in format %Y-%m-%d
    Returns:
        result (list): list of datetime objs, time stamps of calendar months
            from t1 to t2 (both included). Empty if t1 is later than t2.
    '''

    t1dt = datetime.strptime(t1, '%Y-%m-%d')
    t2dt = datetime.strptime(t2, '%Y-%m-%d')
    result = []
    cur = t1dt
    # Absolute zero-based month count. Working on this instead of adding an
    # offset to t1dt.month keeps the month in 1..12 for any start month.
    month_idx = t1dt.year * 12 + (t1dt.month - 1)
    while cur <= t2dt:
        result.append(cur)
        month_idx += 1
        year, month0 = divmod(month_idx, 12)
        if year > datetime.max.year:
            # next month is not representable as a datetime
            break
        cur = datetime(year, month0 + 1, 1)

    return result


def convert2NC(abpath_in, outputdir, vid, isplot=True):
    '''Convert csv data to nc format, save to disk and optionally plot

    The (n_years, 12) table is flattened into one monthly time series
    running from January of the first year to December of the last year.
    Values equal to the file's missing value are replaced by NaN.

    Args:
        abpath_in (str): absolute path to the csv file.
        outputdir (str): folder path to save outputs.
        vid (str): id of variable. E.g. 'pna', 'npo'.
    Keyword Args:
        isplot (bool): create time series plot or not.
    Returns:
        rec (int): 0 is run successfully.
    Raises:
        ValueError: if the csv file contains no data rows.
    '''

    data, years, missing, descr = readCSV(abpath_in)
    if len(years) == 0:
        raise ValueError('No data rows found in %s' % abpath_in)

    data_ts = data.flatten()
    data_ts = np.where(data_ts == missing, np.nan, data_ts)
    timeax = createMonthlyTimeax('%04d-%02d-01' % (years[0], 1),
            '%04d-%02d-01' % (years[-1], 12))

    # --------Save------------------------------------
    file_out_name = '%s_s_m_%d-%d_noaa.nc' % (vid, years[0], years[-1])
    abpath_out = os.path.join(outputdir, file_out_name)
    print('\n### <convert2nc>: Saving output to:\n', abpath_out)

    with Dataset(abpath_out, 'w') as fout:

        #-----------------Create time axis-----------------
        # convert datetime to numbers
        timeax_val = date2num(timeax, 'days since 1900-01-01')
        fout.createDimension('time', None)
        axisvar = fout.createVariable('time', np.float32, ('time',), zlib=True)
        axisvar[:] = timeax_val
        axisvar.setncattr('name', 'time')
        axisvar.setncattr('units', 'days since 1900-01-01')

        #-------------------Add variable-------------------
        varout = fout.createVariable(vid, np.float32, ('time',), zlib=True)
        varout.setncattr('long_name', '%s index' % vid)
        varout.setncattr('standard_name', '%s index' % vid)
        varout.setncattr('title', descr)
        varout.setncattr('units', '')
        varout[:] = data_ts

    # -------------------Plot------------------------
    if isplot:
        import matplotlib.pyplot as plt
        figure, ax = plt.subplots(figsize=(10, 4))
        try:
            # positive values filled in red, negative values in blue
            ts_pos = np.where(data_ts >= 0, data_ts, np.nan)
            ts_neg = np.where(data_ts < 0, data_ts, np.nan)
            ax.fill_between(timeax, ts_pos, y2=0, color='r')
            ax.fill_between(timeax, ts_neg, y2=0, color='b')
            ax.grid(True)
            ax.set_title(vid)

            # ----------------- Save plot------------
            plot_save_name = '%s_timeseries' % vid
            plot_save_name = os.path.join(outputdir, plot_save_name)
            print('\n# <download_climate_indices>: Save figure to', plot_save_name)
            figure.savefig(plot_save_name+'.png', dpi=100, bbox_inches='tight')
            figure.savefig(plot_save_name+'.pdf', dpi=100, bbox_inches='tight')
        finally:
            # pyplot keeps every figure alive until it is closed; this
            # function is called once per index, so release it here.
            plt.close(figure)

    return 0


if __name__ == '__main__':
    # Script entry point: scrape the index table from PAGE_URL, download and
    # convert every listed climate index, then report the ones that failed.

    # makedirs (unlike mkdir) also creates missing parent folders
    os.makedirs(OUTPUTDIR, exist_ok=True)

    print('\n# <download_climate_indices>: Parsing page:', PAGE_URL)

    # Getting page HTML through request; fail early on an HTTP error page
    page = requests.get(PAGE_URL, timeout=60)
    page.raise_for_status()
    # Parsing content using beautifulsoup
    soup = BeautifulSoup(page.content, 'html.parser')
    # Get the big table in the page
    tables = soup.select('table')
    if not tables:
        raise SystemExit('# <download_climate_indices>: No table found in '
                         + PAGE_URL)
    table = tables[0]
    # html.parser does not insert a <tbody>; use it only when it is present
    tbody = table.find('tbody')
    if tbody is not None:
        table = tbody

    # Parse table rows
    table_list = []
    for ii, rowii in enumerate(table.find_all('tr')):
        colsii = rowii.find_all('td')
        if len(colsii) < 2:
            # header or spacer row without the name/description cells
            continue
        nameii = colsii[0].get_text().strip()
        linkii = colsii[0].find('a')
        urlii = None if linkii is None else linkii.get('href')
        if urlii is None:
            print('# <download_climate_indices>: No data link for:', nameii)
            continue
        descripii = colsii[1].get_text()
        desc_urlsii = [aii.get('href') for aii in colsii[1].find_all('a')]
        desc_urlsii = '\n'.join(
            [dii for dii in desc_urlsii if dii is not None])

        dataii = [ii, nameii, urlii, descripii, desc_urlsii]
        table_list.append(dataii)

    # --------------------Fetch data--------------------
    fail_list = []
    for rowii in table_list:

        # One broken index must not stop the others, but say why it failed.
        try:
            rec = fetchData(rowii, OUTPUTDIR)
        except Exception as exc:
            print('# <download_climate_indices>: Error in %s: %r'
                  % (rowii[1], exc))
            rec = 1

        if rec != 0:
            fail_list.append(rowii[:2])

    if not fail_list:
        print('\n# <download_climate_indices>: All done!')
    else:
        print('\n# <download_climate_indices>: Failed jobs:')
        for ii in fail_list:
            print(ii)

