Source code for vast.voidfinder.preprocessing


import numpy as np

from astropy.table import Table
from astropy.io import ascii, fits

from vast.voidfinder.distance import z_to_comoving_dist

import time

import h5py

from vast.voidfinder.constants import c #speed of light

import os



def load_data_to_Table(input_filepath):
    """
    Load a .dat, .txt, .fits, or .h5 file representing a table of numerical 
    values.  For HDF5 (.h5) files, assumes the "columns" are stored as 
    individual datasets on the root h5py.File object.
    
    If the input filepath ends with a .h5 extension, this will attempt to read 
    it in as the HDF5 format, otherwise for all other extensions it will attempt 
    the normal Table.read()
    
    Parameters
    ==========
    
    input_filepath : str
        path to the input .dat, .txt, .fits, or .h5 file
        
    Returns
    =======
    
    astropy.table.Table
    """
        
    if input_filepath.endswith(".h5"):
        
        infile = h5py.File(input_filepath, 'r')
        
        data_table = Table()
        
        for col in list(infile.keys()):
            
            data_table[col] = infile[col][()]
            
        infile.close()

    elif input_filepath.endswith('.fits') or input_filepath.endswith('.fit'):

        hdu = fits.open(input_filepath)

        data_table = Table(hdu[1].data)

        hdu.close()
        
    else:
        
        if os.path.getsize(input_filepath) < 5e9:
            
            data_table = Table.read(input_filepath, format='ascii.commented_header')

        else:
        
            # Import header line
            data_table_fobj = open(input_filepath, 'r')

            header_line = data_table_fobj.readline()

            data_table_fobj.close()

            # Parse header line
            col_names = header_line.split(' ')

            # Read in the data in 100 Mb chunks
            data_table = ascii.read(input_filepath, 
                                    format='no_header', 
                                    names=col_names[1:], 
                                    guess=False, 
                                    fast_reader={'chunk_size': 100*1000000})
        
    return data_table
    
    
    


[docs]def file_preprocess(galaxies_filename, in_directory, out_directory, mag_cut=True, rm_isolated=True, dist_metric='comoving', min_z=None, max_z=None, Omega_M=0.3, h=1.0, verbose=0): ''' Set up output file names, calculate distances, etc. PARAMETERS ========== galaxies_filename : string File name of galaxy catalog. Should be readable by astropy.table.Table.read as a ascii.commented_header file. Required columns include 'ra', 'dec', 'z', and absolute magnitude (either 'rabsmag' or 'magnitude'. in_directory : string Directory path for input files out_directory : string Directory path for output files mag_cut : boolean Determines whether or not to implement a magnitude cut on the galaxy survey. Default is True (remove all galaxies fainter than Mr = -20). rm_isolated : boolean Determines whether or not to remove isolated galaxies (defined as those with the distance to their third nearest neighbor greater than the sum of the average third-nearest-neighbor distance and 1.5 times the standard deviation of the third-nearest-neighbor distances). dist_metric : string Description of which distance metric to use. Options should include 'comoving' (default) and 'redshift'. min_z, max_z : float Minimum and maximum redshift range for the survey mask. Default values are None (determined from galaxy extent). Omega_M : float Value of the matter density of the given cosmology. Default is 0.3. h : float Value of the Hubble constant. Default is 1 (so all distances will be in units of h^-1). RETURNS ======= galaxy_data_table : astropy table Table of all galaxies in catalog. dist_limits : numpy array of shape (2,) Minimum and maximum distances to use for void search. Units are Mpc/h, in either comoving or redshift coordinates (depending on dist_metric). out1_filename : string File name of maximal sphere output file. out2_filename : string File name of all void holes ''' ############################################################################ # Build output file names #--------------------------------------------------------------------------- if mag_cut and rm_isolated: out1_suffix = '_' + dist_metric + '_maximal.txt' out2_suffix = '_' + dist_metric + '_holes.txt' elif rm_isolated: out1_suffix = '_' + dist_metric + '_maximal_noMagCut.txt' out2_suffix = '_' + dist_metric + '_holes_noMagCut.txt' elif mag_cut: out1_suffix = '_' + dist_metric + '_maximal_keepIsolated.txt' out2_suffix = '_' + dist_metric + '_holes_keepIsolated.txt' else: out1_suffix = '_' + dist_metric + '_maximal_noFiltering.txt' out2_suffix = '_' + dist_metric + 'holes_noFiltering.txt' out1_filename = out_directory + galaxies_filename[:-4] + out1_suffix # List of maximal spheres of each void region: x, y, z, radius, distance, ra, dec out2_filename = out_directory + galaxies_filename[:-4] + out2_suffix # List of holes for all void regions: x, y, z, radius, flag (to which void it belongs) #out3_filename = out_directory + 'out3_vollim_dr7.txt' # List of void region sizes: radius, effective radius, evolume, x, y, z, deltap, nfield, vol_maxhole #voidgals_filename = out_directory + 'vollim_voidgals_dr7.txt' # List of the void galaxies: x, y, z, void region ############################################################################ ############################################################################ # Open galaxy catalog #--------------------------------------------------------------------------- in_filename = in_directory + galaxies_filename print("Loading galaxy data table at: ", in_filename, flush=True) load_start_time = time.time() galaxy_data_table = load_data_to_Table(in_filename) print("Galaxy data table load time: ", time.time() - load_start_time, flush=True) ############################################################################ ############################################################################ # Rename columns #--------------------------------------------------------------------------- if mag_cut and ('rabsmag' not in galaxy_data_table.columns): galaxy_data_table['magnitude'].name = 'rabsmag' if 'redshift' not in galaxy_data_table.columns: galaxy_data_table['z'].name = 'redshift' ############################################################################ ############################################################################ # Determine min and max redshifts if not supplied by user #--------------------------------------------------------------------------- # Minimum distance if min_z is None: min_z = min(galaxy_data_table['redshift']) # Maximum distance if max_z is None: max_z = max(galaxy_data_table['redshift']) if dist_metric == 'comoving': # Convert redshift to comoving distance dist_limits = z_to_comoving_dist(np.array([min_z, max_z], dtype=np.float32), Omega_M, h) else: H0 = 100*h dist_limits = c*np.array([min_z, max_z])/H0 ############################################################################ ############################################################################ # Calculate comoving distance #--------------------------------------------------------------------------- if dist_metric == 'comoving': #and 'Rgal' not in galaxy_data_table.columns: print("Calculating Rgal data table column", flush=True) calc_start_time = time.time() galaxy_data_table['Rgal'] = z_to_comoving_dist(galaxy_data_table['redshift'].data.astype(np.float32), Omega_M, h) print("Finished Rgal calculation time: ", time.time() - calc_start_time, flush=True) ############################################################################ return galaxy_data_table, dist_limits, out1_filename, out2_filename