Commit c17db61e authored by Yannick Roehlly

First version of the pdf_analysis module

Maybe we should change the name.
parent f23e702a
# -*- coding: utf-8 -*-
# Copyright (C) 2013 Centre de données Astrophysiques de Marseille
# Copyright (C) 2013-2014 Yannick Roehlly
# Licensed under the CeCILL-v2 licence - see Licence_CeCILL_V2-en.txt
# Author: Yannick Roehlly
"""
Probability Density Function analysis module
============================================
This module builds the probability density functions (PDF) of the SED
parameters to compute their moments.
The models corresponding to all possible combinations of parameters are
computed and their fluxes in the same filters as the observations are
integrated. These fluxes are compared to the observed ones to compute the
χ² of the fit. This χ² gives a probability that is associated with the
model values for the parameters.
At the end, for each parameter, the probability-weighted mean and standard
deviation are computed and the best-fitting model (the one with the lowest
reduced χ²) is given for each observation.
"""
import os
import numpy as np
from numpy import newaxis
from collections import OrderedDict
from datetime import datetime
from progressbar import ProgressBar
from astropy.table import Table, Column
from ...utils import read_table
from .. import AnalysisModule, complete_obs_table
from ...creation_modules import get_module as get_creation_module
from .utils import gen_compute_fluxes_at_redshift
from ...warehouse import SedWarehouse
from ...data import Database
# Tolerance threshold under which any flux or error is considered as 0.
TOLERANCE = 1.e-12
# Probability threshold: models with a lower probability are excluded from
# the moments computation.
MIN_PROBABILITY = 1.e-20
# Name of the file containing the analysis results
RESULT_FILE = "analysis_results.fits"
# Name of the file containing the best models information
BEST_MODEL_FILE = "best_models.fits"
# Directory where the output files are stored
OUT_DIR = "out/"
# Wavelength limits (restframe) when plotting the best SED.
PLOT_L_MIN = 91
PLOT_L_MAX = 1e6
class PdfAnalysis(AnalysisModule):
"""PDF analysis module"""
parameter_list = OrderedDict([
("analysed_variables", (
"array of strings",
"List of the variables (in the SEDs info dictionaries) for which "
"the statistical analysis will be done.",
["sfr", "average_sfr"]
)),
("use_observation_redshift", (
"boolean",
"If true, the redshift of each observation will be taken from the "
"redshift column in the observation table (default) and you must "
"not use a redshifting module. Set to false if you want to use a "
"redshifting module to test various redshifts.",
True
)),
("save_best_sed", (
"boolean",
"If true, save the best SED for each observation to a file.",
False
)),
("plot_best_sed", (
"boolean",
"If true, for each observation save a plot of the best SED "
"and the observed fluxes.",
False
)),
("plot_chi2_distribution", (
"boolean",
"If true, for each observation and each analysed variable "
"plot the value vs reduced chi-square distribution.",
False
)),
("save_pdf", (
"boolean",
"If true, for each observation and each analysed variable "
"save the probability density function.",
False
)),
("plot_pdf", (
"boolean",
"If true, for each observation and each analysed variable "
"plot the probability density function.",
False
)),
("pdf_max_bin_number", (
"integer",
"Maximum number of bins used to compute the probability density "
"function. This is only used when saving or printing the PDF. "
"If there are less values, the probability is given for each "
"one.",
50
)),
("storage_type", (
"string",
"Type of storage used to cache the generate SED.",
"memory"
))
])
def process(self, data_file, column_list, creation_modules,
creation_modules_params, parameters):
"""Process with the psum analysis.
The analysis is done in two nested loops: over each observation and
over each theoretical SEDs. We first loop over the SEDs to limit the
number of time the SEDs are created.
Parameters
----------
data_file: string
Name of the file containing the observations to fit.
column_list: list of strings
Name of the columns from the data file to use for the analysis.
creation_modules: list of strings
List of the module names (in the right order) to use for creating
the SEDs.
creation_modules_params: list of dictionaries
List of the parameter dictionaries for each module.
parameters: dictionary
Dictionary containing the parameters.
"""
# Rename the output directory if it exists
if os.path.exists(OUT_DIR):
new_name = datetime.now().strftime("%Y%m%d%H%M") + "_" + OUT_DIR
os.rename(OUT_DIR, new_name)
print("The existing {} directory was renamed to {}".format(
OUT_DIR,
new_name
))
os.mkdir(OUT_DIR)
# Get the parameters
analysed_variables = parameters["analysed_variables"]
use_observation_redshift = (parameters["use_observation_redshift"]
.lower() == "true")
save_best_sed = (parameters["save_best_sed"].lower() == "true")
plot_best_sed = (parameters["plot_best_sed"].lower() == "true")
plot_chi2_distribution = (
parameters["plot_chi2_distribution"].lower() == "true")
save_pdf = (parameters["save_pdf"].lower() == "true")
plot_pdf = (parameters["plot_pdf"].lower() == "true")
pdf_max_bin_number = int(parameters["pdf_max_bin_number"])
# Get the needed filters in the pcigale database. We use an ordered
# dictionary because we need the keys to always be returned in the
# same order.
with Database() as base:
filters = OrderedDict([(name, base.get_filter(name))
for name in column_list
if not name.endswith('_err')])
# If needed, get the redshifting and IGM attenuation modules (we set
# redshift=0 but the redshift value is adapted later).
redshifting_module = None
igm_module = None
if use_observation_redshift:
redshifting_module = get_creation_module("redshifting", redshift=0)
igm_module = get_creation_module("igm_attenuation")
# Read the observation table and complete it by adding error where
# none is provided and by adding the systematic deviation.
obs_table = complete_obs_table(
read_table(data_file),
column_list,
filters,
TOLERANCE
)
##################################################################
# Model computation #
##################################################################
print("Computing the models fluxes...")
# First, we compute for all the possible theoretical models (one for
# each parameter set in sed_module_parameters) the fluxes in all the
# filters at the redshift of each observed galaxy. These fluxes are
# stored in:
# model_fluxes:
# - axis 0: model index
# - axis 1: observation index
# - axis 2: filter index
        # We use a numpy masked array to mask the fluxes of models that would
        # be older than the age of the Universe at the considered redshift.
# The values for the analysed variables are stored in:
# model_variables:
# - axis 0: the model index in sed_module_params
# - axis 1: the variable index in analysed_variables
model_fluxes = np.ma.zeros((len(creation_modules_params),
len(obs_table),
len(filters)))
model_variables = np.ma.zeros((len(creation_modules_params),
len(analysed_variables)))
# We keep the information (i.e. the content of the sed.info
# dictionary) for each model.
model_info = []
progress_bar = ProgressBar(maxval=len(creation_modules_params)).start()
        # The SED warehouse is used to retrieve the SED corresponding to a
        # given set of modules and parameters.
with SedWarehouse(cache_type=parameters["storage_type"]) as \
sed_warehouse:
            for model_index, module_params in enumerate(
                    creation_modules_params):
                sed = sed_warehouse.get_sed(creation_modules, module_params)
# Cached function to compute the SED fluxes at a redshift
gen_fluxes = gen_compute_fluxes_at_redshift(
sed, filters.values(), redshifting_module, igm_module)
for obs_index, redshift in enumerate(obs_table["redshift"]):
model_fluxes[model_index, obs_index] = gen_fluxes(redshift)
model_variables[model_index] = np.array(
[sed.info[name] for name in analysed_variables]
)
model_info.append(sed.info.values())
progress_bar.update(model_index + 1)
# Mask the invalid fluxes
model_fluxes = np.ma.masked_less(model_fluxes, -99)
progress_bar.finish()
##################################################################
# Observations to models comparison #
##################################################################
print("Comparing the observations to the models...")
        # To accelerate the computations, we put the observed fluxes and
        # errors in multi-dimensional numpy arrays:
# - axis 0: the observation index
# - axis 1: the filter index
obs_fluxes = np.array([
obs_table[name] for name in filters]
).transpose()
obs_errors = np.array([
obs_table[name + "_err"] for name in filters]
).transpose()
        # Some observations may not have a flux value in some filters. In
        # that case, the user is asked to put -9999 as the value. We mask
        # these values.
# Note, we must mask obs_fluxes after obs_errors.
obs_errors = np.ma.masked_where(obs_fluxes < -9990., obs_errors)
obs_fluxes = np.ma.masked_less(obs_fluxes, -9990.)
        # Normalisation factor to be applied to each model's fluxes to best
        # fit each observation's fluxes, and the resulting normalised model
        # fluxes. The normalisation_factors array is:
        # axis 0: model index
        # axis 1: observation index
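        # The factor that minimises the χ² of a model rescaled by α is
        # obtained analytically: solving dχ²/dα = 0 gives
        #   α = Σ(f_mod·f_obs/σ²) / Σ(f_mod²/σ²)
        # which is what is computed below, summing over the filter axis.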
normalisation_factors = (
np.sum(
model_fluxes * obs_fluxes[newaxis, :, :] / (
obs_errors * obs_errors)[newaxis, :, :], axis=2
) / np.sum(
model_fluxes * model_fluxes / (
obs_errors * obs_errors)[newaxis, :, :], axis=2)
)
norm_model_fluxes = model_fluxes * normalisation_factors[:, :, newaxis]
# χ² of the comparison of each model to each observation. The
# chi_squares array is:
# axis 0: model index
# axis 1: observation index
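        # In components, with k running over the filters:
        #   χ²[i, j] = Σ_k ((obs_fluxes[j, k] - norm_model_fluxes[i, j, k])
        #                   / obs_errors[j, k])²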
chi_squares = np.sum(
np.square(
(obs_fluxes[newaxis, :, :] - norm_model_fluxes) /
obs_errors[newaxis, :, :]
), axis=2
)
# We define the reduced χ² as the χ² divided by the number of fluxes
# used for the fitting.
reduced_chi_squares = chi_squares / obs_fluxes.count(1)[newaxis, :]
# We use the exponential probability associated with the χ² as
# likelihood function. The likelihood array is:
# axis 0: model index
# axis 1: observation index
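        # This is the Gaussian likelihood up to a constant factor, under the
        # assumption of independent Gaussian flux errors.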
likelihood = np.exp(-chi_squares/2)
# For the analysis, we consider that the computed models explain each
# observation. We normalise the likelihood function to have a
# total likelihood of 1 for each observation.
likelihood /= np.sum(likelihood, axis=0)[newaxis, :]
# We don't want to take into account the models with a probability
# less that the threshold.
likelihood = np.ma.masked_less(likelihood, MIN_PROBABILITY)
# We re-normalise the likelihood.
likelihood /= np.sum(likelihood, axis=0)[newaxis, :]
# Some model variables depend on the galaxy mass (the normalisation
# factor) so we expand the model_variables array to have a new axis
# corresponding to the observation and multiply (when needed) the
# value of the variables by the normalisation factor of the fitting
# for each observation.
        # The new array will be:
# axis 0: model index
# axis 1: observation index
# axis 2: variable index
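        # (For instance, a star formation rate scales with the galaxy mass
        # and is therefore rescaled below, whereas an intensive quantity such
        # as a temperature is not.)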
model_variables = model_variables[:, newaxis, :].repeat(len(obs_table),
axis=1)
# We take the mass-dependent variable list from the last computed sed.
for index, variable in enumerate(analysed_variables):
if variable in sed.mass_proportional_info:
model_variables[:, :, index] *= normalisation_factors
# We also add the galaxy mass to the analysed variables
analysed_variables.insert(0, "galaxy_mass")
model_variables = np.dstack((normalisation_factors, model_variables))
##################################################################
# Variable analysis #
##################################################################
# We compute the weighted average and standard deviation using the
# likelihood as weight. We first build the weight array by expanding
# the likelihood along a new axis corresponding to the analysed
# variable.
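        # The weighted moments computed below are
        #   <x> = Σ_i w_i·x_i / Σ_i w_i
        #   σ²  = Σ_i w_i·(x_i - <x>)² / Σ_i w_i
        # with w_i the normalised likelihood of model i.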
weights = likelihood[:, :, newaxis].repeat(len(analysed_variables),
axis=2)
        # Analysed variables average and standard deviation arrays.
# axis 0: observation index
# axis 1: variable index
analysed_averages = np.ma.average(model_variables,
axis=0,
weights=weights)
analysed_variances = np.ma.average(
(model_variables - analysed_averages[newaxis, :, :])**2,
axis=0,
weights=weights)
analysed_std = np.ma.sqrt(analysed_variances)
# Create and save the result table.
result_table = Table()
result_table.add_column(Column(
obs_table["id"].data,
name="observation_id"
))
for index, variable in enumerate(analysed_variables):
result_table.add_column(Column(
analysed_averages[:, index],
name=variable
))
result_table.add_column(Column(
analysed_std[:, index],
name=variable+"_err"
))
result_table.write(OUT_DIR + RESULT_FILE)
##################################################################
# Best models #
##################################################################
        # We define the best-fitting model for each observation as the one
        # with the lowest reduced χ².
best_model_index = list(reduced_chi_squares.argmin(axis=0))
# We take the list of information added to the SEDs from the last
# computed one.
model_info_names = sed.info.keys()
best_model_table = Table()
best_model_table.add_column(Column(
obs_table["id"].data,
name="observation_id"
))
best_model_table.add_column(Column(
chi_squares[best_model_index, range(len(best_model_index))],
name="chi_square"
))
best_model_table.add_column(Column(
reduced_chi_squares[best_model_index, range(len(best_model_index))],
name="reduced_chi_square"
))
best_model_table.add_column(Column(
normalisation_factors[best_model_index,
range(len(best_model_index))],
name="galaxy_mass",
unit="Msun"
))
for index, name in enumerate(model_info_names):
best_model_table.add_column(Column(
[model_info[model_idx][index] for model_idx
in best_model_index],
name=name
))
best_model_table.write(OUT_DIR + BEST_MODEL_FILE)
# AnalysisModule to be returned by get_module
Module = PdfAnalysis
# -*- coding: utf-8 -*-
# Copyright (C) 2013 Centre de données Astrophysiques de Marseille
# Copyright (C) 2014 Yannick Roehlly <yannick@iaora.eu>
# Licensed under the CeCILL-v2 licence - see Licence_CeCILL_V2-en.txt
# Author: Yannick Roehlly
import numpy as np
from copy import deepcopy
from ...sed.cosmology import cosmology
from ...creation_modules import get_module as get_creation_module
def gen_compute_fluxes_at_redshift(sed, filters, redshifting_module,
igm_module):
""""Generate function to compute the fluxes of a SED at a given redshift
Given a SED, a list of filters and a redshift module name, this generator
returns a function computing the fluxes of the SED in all the filters at
a given redshift. If the SED is older than the Universe at the given
redshift, the fluxes returned by this function will be all -99.
If the redshift module name is None, the SED is not redshifted by the
returned function. This means that it will always return the same fluxes,
whatever its redshift parameter. This can be used for computing
photometric redshifts.
Parameters
----------
sed : pcigale.sed
The pcigale SED object.
filters : list of pcigale.data.filters
List of pcigale filters objects.
    redshifting_module : pcigale.creation_modules.Module
        A pcigale SED creation module used to redshift the SED; it must
        accept a redshift parameter. May be None.
    igm_module : pcigale.creation_modules.Module
        A pcigale SED creation module adding the IGM attenuation to the SED.
        It is not used if the redshifting module is None.
    Returns
    -------
gen_fluxes : function
Function computing the fluxes of the SED in all filters at a given
redshift.
"""
# The returned function is memoized.
cache = {}
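    # The cache maps each requested redshift to the computed flux array, so
    # that the redshifting, IGM attenuation and filter integrations are done
    # only once per redshift for a given SED.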
def gen_fluxes(redshift):
"""Compute the flux of the SED in various filters.
Parameters
----------
redshift : float
Returns
-------
array fo floats
"""
# If the function is generated without a redshift module, it always
# computes the fluxes at the SED redshift (the SED may be already
# redshifted).
if not redshifting_module:
redshift = sed.redshift
if redshift not in cache:
# Age of the Universe at redshift.
# astropy 0.3 cosmology functions return quantities
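            # cosmology.age() gives the age in Gyr; the factor 1000 converts
            # it to Myr, the unit assumed here for sed.info["age"].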
try:
age_at_redshift = cosmology.age(redshift).value * 1000
except AttributeError:
age_at_redshift = cosmology.age(redshift) * 1000
if sed.info["age"] > age_at_redshift:
cache[redshift] = -99 * np.ones(len(filters))
else:
if redshifting_module:
red_sed = deepcopy(sed)
redshifting_module.parameters["redshift"] = redshift
redshifting_module.process(red_sed)
if igm_module:
igm_module.process(red_sed)
else:
red_sed = sed
cache[redshift] = np.array(
[red_sed.compute_fnu(filt_.trans_table,
filt_.effective_wavelength)
for filt_ in filters])
return cache[redshift]
return gen_fluxes
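# A minimal usage sketch, mirroring the call made in the pdf_analysis module
# (the redshift value is illustrative):
#     gen_fluxes = gen_compute_fluxes_at_redshift(sed, filters.values(),
#                                                 redshifting_module,
#                                                 igm_module)
#     fluxes = gen_fluxes(0.5)  # array of fluxes, or -99 everywhere if the
#                               # SED is older than the Universe at z = 0.5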