Commit 1ba472f3 authored by Médéric Boquien

Finally remove psum. Now the standard analysis module should be pdf_analysis.

parent 857997c1
# -*- coding: utf-8 -*-
# Copyright (C) 2012, 2013 Centre de données Astrophysiques de Marseille
# Licensed under the CeCILL-v2 licence - see Licence_CeCILL_V2-en.txt
# Author: Yannick Roehlly
"""
CIGALE psum analysis module
===========================
This file implements the statistical analysis as performed by the calcX2_psum
programme of the Fortran Cigale code.
The models corresponding to all the possible combinations of parameters
are computed, and their integrated fluxes in the same filters as the
observations are used to compute the χ² of the fit. This χ² gives a
probability that is associated with the model values of the parameters.
At the end, for each parameter, the (probability-)weighted mean and
standard deviation are computed, and the best-fitting model (the one with
the least reduced χ²) is given.
"""
import os
import json
import numpy as np
from itertools import product
from astropy.table import Table, Column
from collections import OrderedDict
from datetime import datetime
from copy import deepcopy
from scipy import stats
from progressbar import ProgressBar
from matplotlib import pyplot as plt
from ..utils import read_table
from . import AnalysisModule
from ..creation_modules import get_module
from ..warehouse import SedWarehouse
from ..data import Database
# Tolerance threshold under which any flux or error is considered to be 0.
TOLERANCE = 1.e-12
# Name of the fits file containing the results
RESULT_FILE = 'psum_results.fits'
# Directory where the output files are stored
OUT_DIR = 'out/'
# Wavelength limits (restframe) when plotting the best SED.
PLOT_L_MIN = 91
PLOT_L_MAX = 1e6
class Psum(AnalysisModule):
"""psum analysis"""
parameter_list = OrderedDict([
("analysed_variables", (
"array of strings",
"List of the variables (in the SEDs info dictionaries) for which "
"the statistical analysis will be done.",
["sfr", "average_sfr"]
)),
("save_best_sed", (
"boolean",
"If true, save the best SED for each observation to a file.",
False
)),
("plot_best_sed", (
"boolean",
"If true, for each observation save a plot of the best SED "
"and the observed fluxes.",
False
)),
("plot_chi2_distribution", (
"boolean",
"If true, for each observation and each analysed variable "
"plot the value vs reduced chi-square distribution.",
False
)),
("save_pdf", (
"boolean",
"If true, for each observation and each analysed variable "
"save the probability density function.",
False
)),
("plot_pdf", (
"boolean",
"If true, for each observation and each analysed variable "
"plot the probability density function.",
False
)),
("pdf_max_bin_number", (
"integer",
"Maximum number of bins used to compute the probability density "
"function. This is only used when saving or printing the PDF. "
"If there are less values, the probability is given for each "
"one.",
50
)),
("storage_type", (
"string",
"Type of storage used to cache the generate SED.",
"memory"
))
])
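    # process() below receives all of these parameter values as strings (cf.
    # the boolean parsing near its top); a hypothetical input would be:
    #     parameters = {"analysed_variables": ["sfr", "average_sfr"],
    #                   "save_best_sed": "False", "plot_best_sed": "False",
    #                   "plot_chi2_distribution": "False", "save_pdf": "False",
    #                   "plot_pdf": "False", "pdf_max_bin_number": "50",
    #                   "storage_type": "memory"}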
def process(self, data_file, column_list, creation_modules,
creation_modules_params, parameters):
"""Process with the psum analysis.
        The analysis is done in two nested loops: over each observation and
        over each theoretical SED. We loop over the SEDs first to limit the
        number of times the SEDs are created.
Parameters
----------
data_file: string
Name of the file containing the observations to fit.
column_list: list of strings
Name of the columns from the data file to use for the analysis.
creation_modules: list of strings
List of the module names (in the right order) to use for creating
the SEDs.
creation_modules_params: list of dictionaries
List of the parameter dictionaries for each module.
parameters: dictionary
Dictionary containing the parameters.
"""
if os.path.exists(OUT_DIR):
new_name = datetime.now().strftime("%Y%m%d%H%M") + "_" + OUT_DIR
os.rename(OUT_DIR, new_name)
print("The existing {} directory was renamed to {}".format(
OUT_DIR,
new_name
))
os.mkdir(OUT_DIR)
# Open the warehouse
sed_warehouse = SedWarehouse(
cache_type=parameters["storage_type"])
# Get the parameters
analysed_variables = parameters["analysed_variables"]
save_best_sed = (parameters["save_best_sed"].lower() == "true")
plot_best_sed = (parameters["plot_best_sed"].lower() == "true")
plot_chi2_distribution = (
parameters["plot_chi2_distribution"].lower() == "true")
save_pdf = (parameters["save_pdf"].lower() == "true")
plot_pdf = (parameters["plot_pdf"].lower() == "true")
pdf_max_bin_number = int(parameters["pdf_max_bin_number"])
results = {'galaxy_mass': [], 'galaxy_mass_err': []}
for variable in analysed_variables:
results[variable] = []
results[variable + '_err'] = []
        # We will save a table containing the best SED corresponding to each
        # observation. This table contains the parameters of the SED as well
        # as the values of the analysed variables.
out_bestsed_columns = ["id"]
for module_param_list in zip(creation_modules,
creation_modules_params[0]):
for module_param in product([module_param_list[0]],
module_param_list[1].keys()):
out_bestsed_columns.append(".".join(module_param))
out_bestsed_columns += analysed_variables
out_bestsed_rows = []
        # We get the transmission table and the effective wavelength of each
        # filter used.
filter_list = [name for name in column_list
if not name.endswith('_err')]
transmission = {}
effective_wavelength = {}
with Database() as base:
for name in filter_list:
filt = base.get_filter(name)
transmission[name] = filt.trans_table
effective_wavelength[name] = filt.effective_wavelength
# We get the redshift and igm modules.
redshift_module = get_module("redshifting", redshift=0)
igm_module = get_module("igm_attenuation")
        # Read the observation table and complete it by adding errors where
        # none are provided and by adding the systematic deviation.
obs_table = read_table(data_file)
for name in filter_list:
name_err = name + '_err'
if name_err not in column_list:
if name_err not in obs_table.columns:
obs_table.add_column(Column(name=name_err, data=
np.zeros(np.shape(obs_table))))
else:
obs_table[name_err] = np.zeros(np.shape(obs_table))
obs_table[name_err] = adjust_errors(obs_table[name],
obs_table[name_err])
        # As we loop first over the models (to limit the number of times a
        # model is computed), we need to store the computation results to
        # perform the analysis for each observation in a second pass. For
        # this, we use a three-axis numpy array: the first axis is the model
        # (based on the index in the creation_modules_params list), the
        # second axis is the observation (based on the data table row index)
        # and the third axis is the considered variable (based on the
        # analysed variables list, with reduced_chi2, probability and
        # galaxy_mass at the beginning).
comp_table = np.zeros((len(creation_modules_params),
np.shape(obs_table)[0],
len(analysed_variables) + 3), dtype=float)
comp_table[:, :, :] = np.nan
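        # For example, comp_table[i, j, 0] is the reduced χ² of model i for
        # observation j, comp_table[i, j, 1] its associated probability,
        # comp_table[i, j, 2] the corresponding galaxy mass and
        # comp_table[i, j, 3:] the values of the analysed variables.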
# We loop over all the possible theoretical SEDs
progress_bar = ProgressBar(maxval=len(creation_modules_params)).start()
        for model_index, model_params in enumerate(creation_modules_params):
            sed = sed_warehouse.get_sed(creation_modules, model_params)
            # Compute the reduced chi-square, the galaxy mass (normalisation
            # factor) and the probability for each observed SED. Add these
            # and the values of the analysed variables to comp_table.
for obs_index in range(np.shape(obs_table)[0]):
obs_redshift = obs_table['redshift'][obs_index]
obs_fluxes = [obs_table[name][obs_index]
for name in filter_list]
obs_errors = [obs_table[name + '_err'][obs_index]
for name in filter_list]
# We copy the SED before redshifting it.
red_sed = deepcopy(sed)
redshift_module.parameters["redshift"] = obs_redshift
redshift_module.process(red_sed)
igm_module.process(red_sed)
# Theoretical fluxes
theor_fluxes = [red_sed.compute_fnu(transmission[name],
effective_wavelength[name])
for name in filter_list]
reduced_chi2, galaxy_mass, probability = compute_chi2(
theor_fluxes, obs_fluxes, obs_errors)
comp_table[model_index, obs_index, 0] = reduced_chi2
comp_table[model_index, obs_index, 1] = probability
comp_table[model_index, obs_index, 2] = galaxy_mass
for index, variable in enumerate(analysed_variables):
if variable in sed.mass_proportional_info:
comp_table[model_index, obs_index, index + 3] = \
galaxy_mass * sed.info[variable]
else:
comp_table[model_index, obs_index, index + 3] = \
sed.info[variable]
progress_bar.update(model_index + 1)
progress_bar.finish()
# Loop over the observations to find the best fitting model and
# compute the parameter statistics.
for obs_index, obs_name in enumerate(obs_table['id']):
# Convert the observation name to string, in case it is a number.
obs_name = str(obs_name)
            # We will save the part of the computation table corresponding
            # to this observation as a FITS file.
fits_table = Table()
fits_table.table_name = "Analysis computation table"
fits_table.add_column(Column(name="reduced_chi-square",
data=comp_table[:, obs_index, 0]))
fits_table.add_column(Column(name="probability",
data=comp_table[:, obs_index, 1]))
            # Find the model corresponding to the least reduced chi-square;
            # if there is more than one model with the minimal chi-square
            # value, only the first is returned.
best_index = comp_table[:, obs_index, 0].argmin()
best_chi2 = comp_table[best_index, obs_index, 0]
best_norm_factor = comp_table[best_index, obs_index, 2]
best_params = creation_modules_params[best_index]
best_sed = sed_warehouse.get_sed(creation_modules, best_params)
            # We need to pass the SED through the redshift/IGM modules
            # before plotting it.
obs_redshift = obs_table['redshift'][obs_index]
redshift_module.parameters["redshift"] = obs_redshift
redshift_module.process(best_sed)
# Add the best SED to the best SED output table
bestsed_line = [obs_name]
            for module_param in best_params:
                for value in module_param.values():
                    if isinstance(value, list):
                        value = ".".join(value)
bestsed_line.append(value)
bestsed_line += [best_sed.info[item] for item in analysed_variables]
out_bestsed_rows.append(bestsed_line)
# Save best SED
# TODO: For now, we only save the lambda vs fnu table. Once
# we develop a way to serialise the SED, we should save the
# complete SED object.
if save_best_sed:
best_sed_table = Table()
best_sed_table.table_name = "Best SED"
best_sed_table.add_column(Column(name="wavelength",
data=best_sed.wavelength_grid,
unit="nm"))
best_sed_table.add_column(Column(name="fnu",
data=best_norm_factor
* best_sed.fnu,
unit="mJy"))
best_sed_table.write(OUT_DIR + obs_name + 'bestSED.fits',
format='fits')
# Write the SED modules parameters to a file
with open(OUT_DIR + obs_name + "bestSED.params", "w") as f:
f.write(json.dumps(list(zip(creation_modules, best_params)),
indent=2,
separators=(',', ': ')))
# Plot the best SED
if plot_best_sed:
figure = plt.figure()
ax = figure.add_subplot(111)
plot_x, plot_y = best_sed.wavelength_grid, best_sed.fnu
plot_mask = (
(plot_x >= PLOT_L_MIN * (1 + obs_redshift)) &
(plot_x <= PLOT_L_MAX * (1 + obs_redshift))
)
ax.loglog(plot_x[plot_mask],
best_norm_factor * plot_y[plot_mask],
'-b')
ax.loglog([effective_wavelength[name] for name in filter_list],
[obs_table[name][obs_index] for name in filter_list],
'or')
ax.set_xlabel('Wavelength [nm]')
ax.set_ylabel('Flux [mJy]')
ax.set_title(obs_name +
' best fitting SED - reduced chi2:' +
str(best_chi2))
figure.savefig(OUT_DIR + obs_name + '_bestSED.pdf')
plt.close(figure)
# Compute the statistics for the desired variables. First, we
# build the probability distribution function (PDF) for the
# variable, then we use it to compute the expected value and
# the standard deviation.
for index, variable in enumerate(['galaxy_mass'] +
analysed_variables):
                # The variable index in the last axis of comp_table is
                # index + 2, as the reduced chi2 and the probability are
                # stored at the beginning.
idx = index + 2
values = comp_table[:, obs_index, idx]
probabilities = comp_table[:, obs_index, 1]
fits_table.add_column(Column(name=variable, data=values))
# We compute the Probability Density Function by binning the
# data into evenly populated bins. To estimate the binning
# quality, we also provide the standard deviation of the
# variable and probability values in the bin.
pdf_values = []
pdf_probs = []
pdf_value_sigma = []
pdf_prob_sigma = []
                # The maximum number of bins is the lesser of
                # pdf_max_bin_number and the number of distinct values of
                # the analysed variable.
pdf_bin_number = min(pdf_max_bin_number,
len(np.unique(values)))
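                # bin_evenly (defined later in this file) is expected,
                # judging from its use below, to return the
                # pdf_bin_number + 1 bin boundaries and, for each value,
                # the 1-based index of the bin it falls into.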
pdf_bin_boundaries, pdf_bins = bin_evenly(values,
pdf_bin_number)
pdf_bin_sizes = [
pdf_bin_boundaries[i + 1] - pdf_bin_boundaries[i]
for i in range(pdf_bin_number)
]
                for bin_index in range(1, pdf_bin_number + 1):
                    # The bin probability is the average of the
                    # probabilities in the bin.
                    pdf_probs.append(
                        np.average(probabilities[pdf_bins == bin_index]))
                    pdf_prob_sigma.append(
                        np.std(probabilities[pdf_bins == bin_index]))
                    pdf_values.append(
                        (pdf_bin_boundaries[bin_index] +
                         pdf_bin_boundaries[bin_index - 1]) / 2)
                    pdf_value_sigma.append(
                        np.std(values[pdf_bins == bin_index]))
pdf_probs = np.array(pdf_probs)
pdf_prob_sigma = np.array(pdf_prob_sigma)
pdf_values = np.array(pdf_values)
pdf_value_sigma = np.array(pdf_value_sigma)
# We normalise the PDF to 1 over all the parameter values.
pdf_probs = pdf_probs / np.sum(pdf_bin_sizes * pdf_probs)
# From the PDF, compute the expected value and standard
# deviation.
expected_value = np.sum(pdf_values * pdf_bin_sizes * pdf_probs)
sigma = np.sqrt(
np.sum((pdf_values - expected_value) ** 2 *
pdf_bin_sizes *
pdf_probs)
)
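                # In discrete form, with bin centres x_b, widths Δ_b and the
                # normalised probabilities p_b (so that Σ_b p_b Δ_b = 1),
                # these are:
                #     E[x] = Σ_b x_b p_b Δ_b
                #     σ_x  = sqrt(Σ_b (x_b - E[x])² p_b Δ_b)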
results[variable].append(expected_value)
results[variable + '_err'].append(sigma)
# We plot all the (value, reduced_chi2) tuples.
if plot_chi2_distribution:
figure = plt.figure()
ax = figure.add_subplot(111)
ax.plot(values,
comp_table[:, obs_index, 0],
'.')
ax.set_xlabel('value')
ax.set_ylabel('reduced chi square')
ax.set_title(variable)
figure.savefig(OUT_DIR +
obs_name + '_' + variable + '_chi2plot.pdf')
plt.close(figure)
if save_pdf:
pdf_table = Table()
pdf_table.table_name = "Probability Density Function"
pdf_table.add_column(Column(name="bin_start",
data=pdf_bin_boundaries[:-1]))
pdf_table.add_column(Column(name="bin_end",
data=pdf_bin_boundaries[1:]))
pdf_table.add_column(Column(name="value", data=pdf_values))
pdf_table.add_column(Column(name="value_sigma",
data=pdf_value_sigma))
pdf_table.add_column(Column(name="probability",
data=pdf_probs))
pdf_table.add_column(Column(name="probability_sigma",
data=pdf_prob_sigma))
pdf_table.write(OUT_DIR + obs_name + "_" + variable +
"_pdf.fits", format='fits')
if plot_pdf:
figure = plt.figure()
ax = figure.add_subplot(111)
ax.bar(
pdf_bin_boundaries[:-1],
pdf_probs,
pdf_bin_sizes
)
ax.axvline(expected_value, color="r")
ax.axvline(expected_value - sigma,
color="r",
linestyle="-.")
ax.axvline(expected_value + sigma,
color="r",
linestyle="-.")
ax.set_title(obs_name + ' ' + variable + ' PDF')
ax.set_xlabel(variable)
ax.set_ylabel('Probability')
figure.savefig(OUT_DIR + obs_name + "_" + variable +
"_pdf.pdf")
plt.close(figure)
# Write the computation table FITS
fits_table.write(OUT_DIR + obs_name + '_comptable.fits', format='fits')
# Write the results to the fits file
result_table = Table()
result_table.table_name = "Analysis results"
result_table.add_column(Column(name='id', data=obs_table['id']))
for variable in (['galaxy_mass'] + analysed_variables):
result_table.add_column(Column(name=variable,
data=results[variable]))
result_table.add_column(Column(name=variable + '_err',
data=results[variable + '_err']))
result_table.write(OUT_DIR + RESULT_FILE)
# Write the best SED table
out_bestsed_table = Table(list(zip(*out_bestsed_rows)),
names=out_bestsed_columns)
out_bestsed_table.write(OUT_DIR + "bestSeds.xml", format="votable")
sed_warehouse.close()
def adjust_errors(flux, error, default_error=0.1, systematic_deviation=0.1):
"""Adjust the errors replacing the 0 values by the default error and
adding the systematic deviation.
The systematic deviation change the error to:
sqrt( error² + (flux * deviation)² )
Parameters
----------
flux : array of floats
Fluxes.
error : array of floats
Observational error in the same unit as the fluxes.
default_error : float
        Default error factor used when the provided error is under the
        tolerance threshold.
systematic_deviation : float
Systematic deviation added to the error.
Returns
-------
error : array of floats
The corrected errors.
"""
# The arrays must have the same lengths.
if len(flux) != len(error):
raise ValueError("The flux and error arrays must have the same "
"length.")
    # We copy the error array so as not to modify the original one.
error = np.copy(error)
# Replace errors below tolerance by the default one.
error[error < TOLERANCE] = (default_error * flux[error < TOLERANCE])
# Add the systematic error.
error = np.sqrt(np.square(error) + np.square(flux * systematic_deviation))
return error
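# A minimal usage sketch (values purely illustrative): with the default
# factors, a zero error is first replaced by 10% of the flux, then the 10%
# systematic deviation is added in quadrature:
#     adjust_errors(np.array([1., 2.]), np.array([0., 0.1]))
#     # -> [sqrt(0.1² + 0.1²), sqrt(0.1² + 0.2²)] ≈ [0.141, 0.224]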
def compute_chi2(model_fluxes, obs_fluxes, obs_errors):
"""Compute chi square value and normalisation factor for the comparison
of a model fluxes to observational ones.
Parameters
----------
model_fluxes : array of floats
Model fluxes.
obs_fluxes : array of floats
Observation fluxes for the same filters as the model ones and
in the same unit.
obs_errors : array of floats
        Errors on the observed fluxes. The errors must be Gaussian for the
        chi-square to be meaningful.
Returns
-------
chi2_reduced : float
        Reduced chi-square value for the comparison. The maximum chi-square
        value returned is 99.
normalisation_factor : float
Normalisation factor that must be applied to the model to fit the
observation.
probability: float
Probability associated with the chi-square.
"""
# The three arrays must have the same length.
if (len(model_fluxes) != len(obs_fluxes) or
len(obs_fluxes) != len(obs_errors)):
raise ValueError("The model fluxes, observation fluxes and "
"observation errors arrays must have the "
"same length.")
    # We copy the arrays so as not to modify the original ones.
model_fluxes = np.copy(model_fluxes)
obs_fluxes = np.copy(obs_fluxes)
obs_errors = np.copy(obs_errors)
    # If no observed flux is over the tolerance threshold, or if any error
    # for valid fluxes is under the threshold, then the observation is
    # flagged as not fitting at all.
if (max(obs_fluxes) < TOLERANCE or
min(obs_errors[obs_fluxes > TOLERANCE]) < TOLERANCE):
reduced_chi2 = 99
normalisation_factor = 1
probability = 0
else:
# We make the computation using only the filters for which the
# observation error is over the tolerance threshold.
(model_fluxes, obs_fluxes, obs_errors) = \
(model_fluxes[obs_errors > TOLERANCE],
obs_fluxes[obs_errors > TOLERANCE],
obs_errors[obs_errors > TOLERANCE])
# We consider that we fit the observation to the SED, independently of
# the number of parameters used to build it. So the number of degrees
# of freedom depends only on the number of fluxes.
degrees_of_freedom = len(model_fluxes) - 1
if degrees_of_freedom == 0:
#FIXME
reduced_chi2 = 0
normalisation_factor = np.sum(obs_fluxes) / np.sum(model_fluxes)
probability = 1
else:
normalisation_factor = (np.sum(obs_fluxes * model_fluxes /
(obs_errors * obs_errors)) /
np.sum(model_fluxes * model_fluxes /
(obs_errors * obs_errors)))
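            # This is the analytic minimum of χ²(α) = Σ((f_obs - α f_mod)/σ)²
            # with respect to the scaling factor α: setting dχ²/dα = 0 gives
            #     α = Σ(f_obs f_mod / σ²) / Σ(f_mod² / σ²).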
norm_model_fluxes = normalisation_factor * model_fluxes
chi2 = np.sum(np.square((obs_fluxes - norm_model_fluxes) /
obs_errors))
reduced_chi2 = chi2 / degrees_of_freedom
reduced_chi2 = min(reduced_chi2, 99)