Commit 86043a6: WIP Dask Cluster
1 parent f726605

2 files changed: +138 -110 lines


requirements/base.txt (1 addition, 1 deletion)

@@ -1,7 +1,7 @@
 aplpy==2.0.3
 astropy==4.0
 astroquery==0.4
-cloudpickle==1.5.0
+cloudpickle==1.6.0
 dask[complete]==2.15.0
 dill==0.3.1.1
 distributed==2.26.0
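Note: the cloudpickle bump is not cosmetic. Dask and distributed use cloudpickle to serialise task functions, including the lambdas and closures mapped over bags later in this commit, so the pinned version needs to match what the scheduler and workers expect. A minimal sketch of the round-trip Dask performs under the hood (toy function, not from this commit):

import cloudpickle

# stdlib pickle cannot serialise lambdas/closures; cloudpickle can,
# which is what lets Dask ship them to worker processes
scale = 1.e3
to_mjy = lambda jy: jy * scale

restored = cloudpickle.loads(cloudpickle.dumps(to_mjy))
assert restored(0.002) == 2.0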

vast_pipeline/pipeline/new_sources.py (137 additions, 109 deletions)

@@ -3,13 +3,15 @@
 import pandas as pd
 import numpy as np
 import dask.dataframe as dd
+import dask.bag as db
 
 from psutil import cpu_count
 from astropy import units as u
 from astropy.coordinates import SkyCoord
 from astropy.io import fits
 from astropy.wcs import WCS
 from astropy.wcs.utils import skycoord_to_pixel
+from typing import Dict
 
 from vast_pipeline.models import Image
 from vast_pipeline.utils.utils import StopWatch
@@ -18,96 +20,123 @@
 logger = logging.getLogger(__name__)
 
 
-def check_primary_image(row):
+def check_primary_image(row: pd.Series) -> bool:
     return row['primary'] in row['img_list']
 
 
-def get_image_rms_measurements(group):
-    """
-    Take the coordinates provided from the group
-    and measure the array value in the provided image.
-    """
-    image = group.iloc[0]['img_diff_rms_path']
-
+def extract_rms_data_from_img(image: str) -> Dict:
     with fits.open(image) as hdul:
-        wcs = WCS(hdul[0].header, naxis=2)
+        wcs = WCS(hdul[0].header, naxis=2)
+        try:
+            # ASKAP tile images
+            data = hdul[0].data[0, 0, :, :]
+        except Exception as e:
+            # ASKAP SWarp images
+            data = hdul[0].data
+
+    return {'data': data, 'wcs': wcs}
 
-    try:
-        # ASKAP tile images
-        data = hdul[0].data[0, 0, :, :]
-    except Exception as e:
-        # ASKAP SWarp images
-        data = hdul[0].data
 
+def get_coord_array(df: pd.DataFrame) -> SkyCoord:
     coords = SkyCoord(
-        group.wavg_ra, group.wavg_dec, unit=(u.deg, u.deg)
+        df['wavg_ra'].values,
+        df['wavg_dec'].values,
+        unit=(u.deg, u.deg)
     )
 
-    array_coords = wcs.world_to_array_index(coords)
+    return coords
+
+
+def finalise_rms_calcs(rms: Dict, coords: np.array,
+                       df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Take the coordinates provided from the group
+    and measure the array value in the provided image.
+    """
+    array_coords = rms['wcs'].world_to_array_index(coords)
     array_coords = np.array([
         np.array(array_coords[0]),
        np.array(array_coords[1]),
     ])
 
     # check for pixel wrapping
     x_valid = np.logical_or(
-        array_coords[0] >= data.shape[0],
+        array_coords[0] >= rms['data'].shape[0],
         array_coords[0] < 0
     )
 
     y_valid = np.logical_or(
-        array_coords[1] >= data.shape[1],
+        array_coords[1] >= rms['data'].shape[1],
         array_coords[1] < 0
     )
 
-    valid = ~np.logical_or(
-        x_valid, y_valid
-    )
-
-    valid_indexes = group[valid].index.values
+    # calculate the mask for indexes
+    valid = ~np.logical_or(x_valid, y_valid)
 
-    rms_values = data[
+    # create the column data, not matched ones will be NaN.
+    rms_values = np.full(valid.shape, np.NaN)
+    rms_values[valid] = rms['data'][
         array_coords[0][valid],
         array_coords[1][valid]
-    ]
+    ].astype(np.float64) * 1.e3
 
-    # not matched ones will be NaN.
-    group.loc[
-        valid_indexes, 'img_diff_true_rms'
-    ] = rms_values.astype(np.float64) * 1.e3
+    # copy the df and create the rms column
+    df_out = df.copy()  # dask doesn't like to modify inputs in place
+    df_out['img_diff_true_rms'] = rms_values
 
-    return group
+    return df_out
 
 
-def parallel_get_rms_measurements(df):
+def parallel_get_rms_measurements(df: dd.core.DataFrame) -> dd.core.DataFrame:
     """
     Wrapper function to use 'get_image_rms_measurements'
     in parallel with Dask.
     """
+    # Use the Dask bag backend to work on the different image files:
+    # first calculate the unique img_diff paths, then create the bag
+    uniq_img_diff = (
+        df['img_diff_rms_path'].unique()
+        .compute()
+        .tolist()
+    )
+    nr_uniq_img = len(uniq_img_diff)
+    # map the extract function to the bag to get data from images
+    img_data_bags = (
+        db.from_sequence(uniq_img_diff, npartitions=nr_uniq_img)
+        .map(extract_rms_data_from_img)
+    )
 
-    out = df[[
-        'source', 'wavg_ra', 'wavg_dec',
-        'img_diff_rms_path'
-    ]]
-
-    col_dtype = {
-        'source': 'i',
-        'wavg_ra': 'f',
-        'wavg_dec': 'f',
-        'img_diff_rms_path': 'U',
-        'img_diff_true_rms': 'f',
-    }
-
-    n_cpu = cpu_count() - 1
-
-    out = (
-        dd.from_pandas(out, n_cpu)
-        .groupby('img_diff_rms_path')
-        .apply(
-            get_image_rms_measurements,
-            meta=col_dtype
-        ).compute(num_workers=n_cpu, scheduler='processes')
+    # generate bags with dataframes for each unique image_diff
+    cols = ['img_diff_rms_path', 'source', 'wavg_ra', 'wavg_dec']
+    df_bags = []
+    for elem in uniq_img_diff:
+        df_bags.append(df.loc[df['img_diff_rms_path'] == elem, cols])
+    df_bags = dd.compute(*df_bags)
+    df_bags = db.from_sequence(df_bags, npartitions=nr_uniq_img)
+
+    # map the get_coord_array and column selection function
+    arr_coords_bags = df_bags.map(get_coord_array)
+    col_sel_bags = df_bags.map(lambda onedf: onedf[['source']])
+
+    # combine the bags and apply final operations, this will create a list
+    # of pandas dataframes
+    out_bag = (
+        db.zip(img_data_bags, arr_coords_bags, col_sel_bags)
+        .map(lambda tup: finalise_rms_calcs(*tup))
+        # .to_delayed()
+        .persist()
     )
+    import ipdb; ipdb.set_trace()  # breakpoint 3b4d58cc //
+
+    # out = (
+    #     # dd.from_pandas(out, n_cpu)
+    #     df[['source', 'wavg_ra', 'wavg_dec', 'img_diff_rms_path']]
+    #     .groupby('img_diff_rms_path')
+    #     .apply(
+    #         get_image_rms_measurements,
+    #         meta=col_dtype
+    #     )
+    # )
 
     df = df.merge(
         out[['source', 'img_diff_true_rms']],
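Note: the rewrite above replaces the old groupby-apply-per-image pattern with three dask.bag collections, one partition per unique RMS image, zipped element-wise so that each finalise_rms_calcs call receives the matching image data, coordinate array, and source subset. A toy sketch of that zip/map pattern (all names and data below are illustrative, not from the commit):

import dask.bag as db

# three bags with identical partitioning, standing in for the image
# data, coordinate arrays and per-image source frames built above
images = db.from_sequence(['rms_a.fits', 'rms_b.fits'], npartitions=2)
coords = db.from_sequence([(1.0, -30.0), (2.0, -31.0)], npartitions=2)
labels = db.from_sequence(['a', 'b'], npartitions=2)

def combine(image, coord, label):
    # stand-in for finalise_rms_calcs: one result per zipped triple
    return {'image': image, 'coord': coord, 'label': label}

out = db.zip(images, coords, labels).map(lambda tup: combine(*tup))
print(out.compute())
# [{'image': 'rms_a.fits', 'coord': (1.0, -30.0), 'label': 'a'}, ...]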
@@ -118,82 +147,79 @@ def parallel_get_rms_measurements(df):
     return df
 
 
-def new_sources(sources_df, missing_sources_df, min_sigma, p_run):
+def new_sources(sources_df: dd.core.DataFrame,
+    missing_sources_df: dd.core.DataFrame, min_sigma: float, p_run: Run
+) -> dd.core.DataFrame:
     """
     Process the new sources detected to see if they are
     valid.
     """
 
-    timer = StopWatch()
+    # timer = StopWatch()
 
-    logger.info("Starting new source analysis.")
+    logger.info('Starting new source analysis.')
 
     cols = [
-        'id', 'name', 'noise_path', 'datetime',
+        'name', 'noise_path', 'datetime',
         'rms_median', 'rms_min', 'rms_max'
     ]
 
     images_df = pd.DataFrame(list(
-        Image.objects.filter(
-            run=p_run
-        ).values(*tuple(cols))
+        Image.objects.filter(run=p_run)
+        .values(*tuple(cols))
     )).set_index('name')
 
     # Get rid of sources that are not 'new', i.e. sources for which the
     # first sky region image is not in the image list
 
     missing_sources_df['primary'] = missing_sources_df[
         'skyreg_img_list'
-    ].apply(lambda x: x[0])
+    ].apply(lambda x: x[0], meta=str)
 
     missing_sources_df['detection'] = missing_sources_df[
         'img_list'
-    ].apply(lambda x: x[0])
+    ].apply(lambda x: x[0], meta=str)
 
     missing_sources_df['in_primary'] = missing_sources_df[
         ['primary', 'img_list']
-    ].apply(
-        check_primary_image,
-        axis=1
-    )
+    ].apply(check_primary_image, axis=1, meta=bool)
 
     new_sources_df = missing_sources_df[
         missing_sources_df['in_primary'] == False
-    ].drop(
-        columns=['in_primary']
-    )
+    ].drop(columns=['in_primary'])
+    del missing_sources_df
 
     # Check if the previous sources would have actually been seen
     # i.e. are the previous images sensitive enough
 
-    # save the index before exploding
-    new_sources_df = new_sources_df.reset_index()
-
-    # Explode now to avoid two loops below
-    new_sources_df = new_sources_df.explode('img_diff')
-
+    # save the index and explode now to avoid two loops below
     # Merge the respective image information to the df
-    new_sources_df = new_sources_df.merge(
-        images_df[['datetime']],
-        left_on='detection',
-        right_on='name',
-        how='left'
-    ).rename(columns={'datetime':'detection_time'})
-
-    new_sources_df = new_sources_df.merge(
-        images_df[[
-            'datetime', 'rms_min', 'rms_median',
-            'noise_path'
-        ]],
-        left_on='img_diff',
-        right_on='name',
-        how='left'
-    ).rename(columns={
-        'datetime':'img_diff_time',
-        'rms_min': 'img_diff_rms_min',
-        'rms_median': 'img_diff_rms_median',
-        'noise_path': 'img_diff_rms_path'
-    })
+    new_sources_df = (
+        new_sources_df.reset_index()
+        .explode('img_diff')
+        .merge(
+            images_df[['datetime']],
+            left_on='detection',
+            right_on='name',
+            how='left'
+        )
+        .rename(columns={'datetime': 'detection_time'})
+        # )
+        # new_sources_df = (
+        #     new_sources_df.merge(
+        .merge(
+            images_df,
+            left_on='img_diff',
+            right_on='name',
+            how='left'
+        )
+        .rename(columns={
+            'datetime': 'img_diff_time',
+            'rms_min': 'img_diff_rms_min',
+            'rms_median': 'img_diff_rms_median',
+            'noise_path': 'img_diff_rms_path'
+        })
+    )
 
     # Select only those images that come before the detection image
     # in time.
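Note: the meta= arguments added to the .apply() calls above declare the output type up front; without meta, Dask infers it by running the callable on a small dummy sample and emits a warning. A minimal sketch of the idea with toy data (not the pipeline's):

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(
    pd.DataFrame({'img_list': [['img1', 'img2'], ['img3']]}),
    npartitions=1
)

# meta names the result column and its dtype so the graph can be
# built without sampling; each list is reduced to its first element
first = ddf['img_list'].apply(lambda x: x[0], meta=('img_list', 'object'))
print(first.compute())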
@@ -202,11 +228,15 @@ def new_sources(sources_df, missing_sources_df, min_sigma, p_run):
     ]
 
     # merge the detection fluxes in
-    new_sources_df = pd.merge(
-        new_sources_df, sources_df[['source', 'image', 'flux_peak']],
-        left_on=['source', 'detection'], right_on=['source', 'image'],
-        how='left'
-    ).drop(columns=['image'])
+    new_sources_df = (
+        new_sources_df.merge(
+            sources_df,
+            left_on=['source', 'detection'],
+            right_on=['source', 'image'],
+            how='left'
+        )
+        .drop(columns=['image'])
+    )
 
     # calculate the sigma of the source if it was placed in the
     # minimum rms region of the previous images
@@ -236,9 +266,7 @@ def new_sources(sources_df, missing_sources_df, min_sigma, p_run):
 
     # measure the actual rms in the previous images at
     # the source location.
-    new_sources_df = parallel_get_rms_measurements(
-        new_sources_df
-    )
+    new_sources_df = parallel_get_rms_measurements(new_sources_df)
 
     # this removes those that are out of range
     new_sources_df['img_diff_true_rms'] = new_sources_df['img_diff_true_rms'].fillna(0.)
@@ -262,8 +290,8 @@ def new_sources(sources_df, missing_sources_df, min_sigma, p_run):
         columns={'true_sigma':'new_high_sigma'}
     )
 
-    logger.info(
-        'Total new source analysis time: %.2f seconds', timer.reset_init()
-    )
+    # logger.info(
+    #     'Total new source analysis time: %.2f seconds', timer.reset_init()
+    # )
 
-    return new_sources_df.set_index('source')
+    return new_sources_df.set_index('source').persist()
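Note: the new .persist() on the return value triggers computation but keeps the result as a Dask collection (on a distributed cluster the partitions stay in worker memory), whereas .compute() would pull everything back into a single pandas DataFrame. A toy sketch of the distinction:

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(
    pd.DataFrame({'source': [1, 2, 3], 'sigma': [5.1, 7.3, 4.2]}),
    npartitions=2
)

kept_lazy = ddf[ddf['sigma'] > 5.0].persist()  # still a dask DataFrame
materialised = kept_lazy.compute()             # now a pandas DataFrame
print(type(kept_lazy), type(materialised))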
