Source code for suncasa.eovsa.eovsa_flarelist

##ipython eovsa_flarelist.py
##=============
import os
import sys
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from scipy.signal import find_peaks
from datetime import datetime, timedelta
from astropy.time import Time
from astropy.io import fits
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# Global settings and initializations
# EO_WIKI_URL = "http://www.ovsa.njit.edu/wiki/index.php/Recent_Flare_List_(2021-)"

[docs]
EO_WIKI_URLs = [
    "https://www.ovsa.njit.edu/wiki/index.php/2017",
    "https://www.ovsa.njit.edu/wiki/index.php/2018",
    "https://www.ovsa.njit.edu/wiki/index.php/2019",
    "https://www.ovsa.njit.edu/wiki/index.php/2020",
    "https://www.ovsa.njit.edu/wiki/index.php/2021",
    "https://www.ovsa.njit.edu/wiki/index.php/2022",
    "https://www.ovsa.njit.edu/wiki/index.php/2023",
    "https://www.ovsa.njit.edu/wiki/index.php/2024",
    "https://www.ovsa.njit.edu/wiki/index.php/2025",
    "https://www.ovsa.njit.edu/wiki/index.php/2026"
]




[docs]
def fetch_flare_data_from_wiki(eo_wiki_urls, given_date_strp, outcsvfile):
    """
    Fetches flare data from the given wiki URL and saves it to a CSV file in the specified directory.

    Parameters:
    - eo_wiki_url: URL of the EO wiki page to fetch data from.
    - given_date_strp: A tuple of datetime objects specifying the start and end dates to filter the data.
    - outcsvfile: path to the resulting CSV file.

    Returns:
    - None
    """
    date_data = []
    time_ut_data = []
    flare_class_data = []
    flare_ID_data = []
    depec_file = []

    for eo_wiki_url in eo_wiki_urls:
        response = requests.get(eo_wiki_url)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content of the page using BeautifulSoup
            soup = BeautifulSoup(response.text, "html.parser")
            tables = soup.find_all("table", {"class": "wikitable"})

            for table in tables:
                for row in table.find_all("tr"):
                    cells = row.find_all("td")

                    if len(cells) >= 3:
                        date = cells[0].text.strip()
                        time_ut = cells[1].text.strip()
                        flare_class = cells[2].text.strip()
                        datetime_strp = datetime.strptime(date + ' ' + time_ut, '%Y-%m-%d %H:%M')

                        if given_date_strp[0] <= datetime_strp <= given_date_strp[1]:
                            # get Date, Time (UT) and Flare Class
                            date_data.append(date)
                            time_ut_data.append(f"{time_ut}:00")
                            flare_class_data.append(flare_class)
                            # get Flare_ID
                            eotime = f"{date.replace('/', '-')}T{time_ut}:00"
                            flare_ID_data.append(eotime.replace('-', '').replace('T', '').replace(':', ''))
                            print("Fetched: ", eotime)

                            depec_file_tmp = ''
                            for cell in cells:
                                link_cell = cell.find('a', class_='external text', href=True, rel='nofollow')
                                if link_cell:
                                    url = link_cell['href']
                                    if url.endswith(".dat") or url.endswith(".fits"):
                                        depec_file_tmp = url.split('/')[-1]
                                        break
                            # get depec_file
                            depec_file.append(str(depec_file_tmp))
        else:
            print("Failed to retrieve the webpage. Status code:", response.status_code)

    # reformate the date and time above
    data = {
        "ID": np.arange(len(date_data)) + 1,
        "Flare_ID": flare_ID_data,
        "Date": date_data,
        "Time (UT)": time_ut_data,
        "Flare Class": flare_class_data,
        "depec_file": depec_file
    }

    df = pd.DataFrame(data)
    df.to_csv(outcsvfile, index=False)
    print(f"Date and Time (UT) data saved to {outcsvfile}")




[docs]
def rd_datfile(file):
    ''' Read EOVSA binary spectrogram file and return a dictionary with times
        in Julian Date, frequencies in GHz, and cross-power data in sfu.

        Return Keys:
          'time'     Numpy array of nt times in JD format
          'fghz'     Numpy array of nf frequencies in GHz
          'data'     Numpy array of size [nf, nt] containing cross-power data

        Returns empty dictionary ({}) if file size is not compatible with inferred dimensions
    '''
    import struct
    import numpy as np
    def dims(file):
        # Determine time and frequency dimensions (assumes the file has fewer than 10000 times)
        f = open(file, 'rb')
        tmp = f.read(83608)  # max 10000 times and 451 frequencies
        f.close()
        nbytes = len(tmp)
        tdat = np.array(struct.unpack(str(int(nbytes / 8)) + 'd', tmp[:nbytes]))
        nt = np.where(tdat < 2400000.)[0]
        nf = np.where(np.logical_or(tdat[nt[0]:] > 18, tdat[nt[0]:] < 1))[0]
        return nt[0], nf[0]

    nt, nf = dims(file)
    f = open(file, 'rb')
    tmp = f.read(nt * 8)
    times = struct.unpack(str(nt) + 'd', tmp)
    tmp = f.read(nf * 8)
    fghz = struct.unpack(str(nf) + 'd', tmp)
    tmp = f.read()
    f.close()
    if len(tmp) != nf * nt * 4:
        print('File size is incorrect for nt=', nt, 'and nf=', nf)
        return {}
    data = np.array(struct.unpack(str(nt * nf) + 'f', tmp)).reshape(nf, nt)
    return {'time': times, 'fghz': fghz, 'data': data}




[docs]
def moving_average(data, window_size):
    # Create a convolution kernel for the moving average
    kernel = np.ones(window_size) / window_size
    return np.convolve(data, kernel, mode='valid')




[docs]
def get_eoflarelist(timerange=None, work_dir='./', file_out='EOVSA_flare_list_from_wiki_sub.csv'):
    '''
    Get eovsa flarelist from given timerange.
    On pipeline or ovsa server.

    Parameters:
    timerange:

    Example:
    final_csv_file = get_eoflarelist(timerange=["2024-01-01 20:00:00", "2024-01-01 21:00:00"])

    '''
    # Convert timerange strings to datetime objects for internal use

    timerange_strp = [datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S') for date_str in timerange]
    print(f"Fetching data for time range {timerange[0]} to {timerange[1]}")

    ##=============Step 1: Fetching data from wiki=============
    print("##=============Step 1: capture the radio peak times from flare list wiki webpage")

    init_csv = os.path.join(work_dir, "get_time_from_wiki_given_date.csv")
    fetch_flare_data_from_wiki(EO_WIKI_URLs, timerange_strp, init_csv)

    ##=============Step 2: read the spectrum data=============
    print("##=============Step 2: read the spectrum data")

    spec_data_dir = "/common/webplots/events/"  # YYYY/
    print("Spec data in ", spec_data_dir)

    df = pd.read_csv(init_csv)
    flare_id = df['Flare_ID']
    depec_file = df['depec_file']

    files_wiki = [spec_data_dir + str(flare_id[i])[0:4] + "/" + str(file_name) for i, file_name in
                  enumerate(depec_file)]

    ##=============
    tpk_spec_wiki = [] 
    tst_mad_spec_wiki, ted_mad_spec_wiki = [], []
    tst_thrd_spec_wiki, ted_thrd_spec_wiki = [], []
    depec_file = []

    ##=============
    for ww, file_wiki in enumerate(files_wiki):  # len(files_wiki)

        filename1 = os.path.basename(file_wiki)
        depec_file.append(filename1)
        print("Reading spec data: ", filename1)

        try:
            if filename1.split('.')[-1] == 'dat':
                filename = filename1.split('.dat')[0]
                data1 = rd_datfile(file_wiki)
                spec = np.array(data1['data'])
                fghz = np.array(data1['fghz'])
                time1 = data1['time']
            if filename1.split('.')[-1] == 'fits':
                filename = filename1.split('.fits')[0]
                eospecfits = fits.open(file_wiki)
                spec = eospecfits[0].data  # [freq, time]
                fghz = np.array(eospecfits[1].data['FGHZ'])  # in GHz
                time1 = np.array(eospecfits[2].data['TIME'])  # in jd format

        except Exception as e:
            temp = datetime.strptime(str(flare_id[ww]), "%Y%m%d%H%M%S")
            temp_st = (temp - timedelta(minutes=2)).strftime("%Y-%m-%d %H:%M:%S")
            temp_ed = (temp + timedelta(minutes=2)).strftime("%Y-%m-%d %H:%M:%S")

            tpk_spec_wiki.append(temp.strftime("%Y-%m-%d %H:%M:%S"))
            tst_mad_spec_wiki.append(temp_st)
            ted_mad_spec_wiki.append(temp_ed)
            tst_thrd_spec_wiki.append(temp_st)
            ted_thrd_spec_wiki.append(temp_ed)

            print('no data found for:', file_wiki)
            print('st/ed time will be tpk \u00B1 2mins: ', temp_st, temp_ed)
            continue

        time_spec = Time(time1, format='jd')
        time_str = time_spec.isot

        ##=============try MAD method
        tpk_tot_ind = []

        tst_tot_ind = []
        ted_tot_ind = []
        mad_threshd = []

        tst_tot_ind_thrd = []
        ted_tot_ind_thrd = []

        spec = np.nan_to_num(spec, nan=0.0)
        spec[spec < 0] = 0.01
        spec_median = np.median(spec, axis=1, keepdims=True)
        spec_abs_deviations = np.abs(spec - spec_median)
        spec_mad = np.median(spec_abs_deviations, axis=1, keepdims=True)
        mad_threshd = 3.0 * spec_mad

        outliers = spec_abs_deviations > mad_threshd

        for ff in range(len(fghz)):
            good_channel = False
            flux_array = spec[ff, :]

            ##=============try MAD method
            outlier_ind = np.where(outliers[ff])[0]
            if len(outlier_ind) > 0:
                tst_tot_ind.append(outlier_ind[0])
                ted_tot_ind.append(outlier_ind[-1])

            ##=============try to set threshold
            window_size = 10
            y = moving_average(flux_array, window_size) + 0.001  ##flux_array
            peaks, _ = find_peaks(y, height=1.)
            noise_thrd_st = np.mean(y[0:5])
            noise_thrd_ed = np.mean(y[-5:])

            if noise_thrd_st == 0:
                noise_thrd_st = 0.005 * np.max(y)
            if noise_thrd_ed == 0:
                noise_thrd_ed = 0.01 * np.max(y)
            if np.max(y) / noise_thrd_ed > 100:
                noise_thrd_ed = 0.02 * np.max(y)
                # noise_thrd_st = np.max([np.mean(y[0:10]),0.01*np.max(y)])
            # noise_thrd_ed = np.max([np.mean(y[-10:])*np.median(y[peaks]),0.01*np.max(y)])
            # if ff == 5:
            #     print(np.max(y), np.median(y[peaks]), np.mean(y[0:10]), np.mean(y[-10:]))

            for ind in range(len(y) - 5):
                if y[ind] < y[ind + 1] < y[ind + 2] < y[ind + 3] < y[ind + 4] < y[ind + 5]:
                    if y[ind + 5] >= 2 * y[ind]:
                        if all(y[i] > noise_thrd_st for i in range(ind, ind + 6)):
                            tst_tot_ind_thrd.append(ind)
                            break
            ind_tmp = np.argmax(flux_array) - 30
            for ind in range(len(y) - 5):
                if y[ind] > y[ind + 1] > y[ind + 2] > y[ind + 3] > y[ind + 4] > y[ind + 5]:
                    if y[ind + 3] <= 2 * y[ind]:
                        if all(abs(y[i]) > noise_thrd_ed for i in range(ind, ind + 6)):
                            ind_tmp = ind + 5
            ted_tot_ind_thrd.append(ind_tmp)

            ##=============tpeak
            tpk_tot_ind.append(np.argmax(flux_array))

        time_st_mad = time_spec[int(np.round(np.median(np.array(tst_tot_ind))))]
        time_ed_mad = time_spec[int(np.round(np.median(np.array(ted_tot_ind))))]

        time_st_thrd = time_spec[int(np.round(np.median(np.array(tst_tot_ind_thrd))))]
        time_ed_thrd = time_spec[int(np.round(np.median(np.array(ted_tot_ind_thrd))))]

        time_st = time_st_thrd
        time_ed = time_ed_thrd

        time_pk = time_spec[int(np.median(np.array(tpk_tot_ind)))]

        time_pk_obj = Time(time_pk, format='jd')

        tpk_spec_wiki.append(time_pk_obj.strftime('%Y-%m-%d %H:%M:%S'))

        tst_mad_spec_wiki.append(Time(time_st_mad, format='jd').strftime('%Y-%m-%d %H:%M:%S'))
        ted_mad_spec_wiki.append(Time(time_ed_mad, format='jd').strftime('%Y-%m-%d %H:%M:%S'))

        tst_thrd_spec_wiki.append(Time(time_st_thrd, format='jd').strftime('%Y-%m-%d %H:%M:%S'))
        ted_thrd_spec_wiki.append(Time(time_ed_thrd, format='jd').strftime('%Y-%m-%d %H:%M:%S'))


    #apply the thrd method
    EO_tpeak = tpk_spec_wiki
    EO_tstart_thrd = tst_thrd_spec_wiki
    EO_tend_thrd = ted_thrd_spec_wiki

    data_csv = {
        "ID": np.arange(len(flare_id)) + 1,
        "Flare_ID": flare_id,
        "Date": df['Date'],
        "Time (UT)": df['Time (UT)'],
        "Flare Class": df['Flare Class'],
        'EO_tstart': EO_tstart_thrd,
        'EO_tpeak': EO_tpeak,
        'EO_tend': EO_tend_thrd,
        'depec_file': df['depec_file']
    }

    df = pd.DataFrame(data_csv)
    final_csv_file = os.path.join(work_dir, file_out)
    df.to_csv(final_csv_file, index=False)
    print(f"Success: {len(flare_id)} flares saved to {file_out}")

    return final_csv_file




[docs]
def run_eoflare_pipeline(flarelist_csv=None, to_web=True):
    ##=========================reading flarelist from .csv=========================
    import pandas as pd
    import os
    import shutil

    df = pd.read_csv(flarelist_csv)
    flare_id_tot = df['Flare_ID']
    EO_tstart_tot = df['EO_tstart']
    EO_tend_tot = df['EO_tend']

    for i,j in enumerate(flare_id_tot):
        print(f"Flare_ID, st, ed time: {flare_id_tot[i]}, {EO_tstart_tot[i]}, {EO_tend_tot[i]}")

    ##=========================running flare pipeline=========================
    ans = 'Y'
    # ans = input('Do you want to continue to run flare pipeline for Flare_IDs above? (say no if you want to stop) [y/n]?')

    root_dir = os.getcwd()

    if ans.upper() == 'Y':
        from suncasa.eovsa import eovsa_flare_pipeline
        from eovsapy.util import Time

        for ff, flare_id in enumerate(flare_id_tot):#flare_id_tot, len(flare_id_tot)
            os.chdir(root_dir)
            try:
                EO_tstart = EO_tstart_tot[ff]
                EO_tend = EO_tend_tot[ff]
                flare_id = str(flare_id)
                print(f"To run flare pipeline for Flare_ID: {flare_id}")

                work_dir = os.path.join(root_dir, flare_id)
                os.makedirs(work_dir, exist_ok=True)
                os.chdir(work_dir)

                trange_str = [EO_tstart, EO_tend]
                trange = Time(trange_str)
                fp = eovsa_flare_pipeline.FlareSelfCalib(vis=trange)
                fp.imaging_start = trange_str[0]
                fp.imaging_end = trange_str[1]
                fp.slfcal_pipeline(doselfcal=True, doimaging=True)
                if to_web:
                    fp.rename_move_files(flare_id=flare_id, fitsdir_web_tp='/data1/eovsa/fits/flares/', 
                                        movdir_web_tp='/common/webplots/SynopticImg/eovsamedia/eovsa-browser/',
                                        msdir_web_tp='/data1/eovsa/fits/flares/', 
                                        dorename_fits=True, domove_fits=True, 
                                        dorename_mov=True, domove_mov=True, domove_ms=True, dormworkdir=True, docopy=True)
                print(f"Success for Flare_ID: {flare_id}")

            except Exception as e:
                print(f"Error at ff = {ff} for Flare_id = {flare_id}: {e}")
                continue
        os.chdir(root_dir)
        print('Done!')


# if __name__ == "__main__":
#     main()