# -*- coding: utf-8 -*-
# Copyright (C) 2013 Centre de données Astrophysiques de Marseille
# Copyright (C) 2013-2014 Institute of Astronomy
# Copyright (C) 2013-2014 Yannick Roehlly <yannick@iaora.eu>
# Licensed under the CeCILL-v2 licence - see Licence_CeCILL_V2-en.txt
# Author: Yannick Roehlly & Médéric Boquien

"""
Probability Density Function analysis module
============================================

This module builds the probability density functions (PDF) of the SED
parameters to compute their moments.

The models corresponding to all possible combinations of parameters are
computed and their fluxes in the same filters as the observations are
integrated. These fluxes are compared to the observed ones to compute the
χ² value of the fit. This χ² gives a probability that is associated with
the model values of the parameters.

At the end, for each parameter, the probability-weighted mean and standard
deviation are computed and the best-fitting model (the one with the lowest
reduced χ²) is given for each observation.

"""

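# Illustrative sketch only (the real computation lives in .workers.analysis):
# the probability weighting and weighted moments described in the docstring
# are expected to be of the form
#
#     weights = np.exp(-chi2 / 2.)
#     mean = np.average(values, weights=weights)
#     std = np.sqrt(np.average((values - mean) ** 2, weights=weights))
#
# where `chi2` and `values` are hypothetical arrays holding, for one object,
# the χ² of every model and the corresponding values of one analysed variable.
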
import numpy as np
from collections import OrderedDict
import multiprocessing as mp
from ...utils import read_table
from .. import AnalysisModule, complete_obs_table
from .utils import save_table_analysis, save_table_best, backup_dir
from ...warehouse import SedWarehouse
from ...data import Database
from .workers import sed as worker_sed
from .workers import analysis as worker_analysis
from ..utils import find_changed_parameters
import pcigale.analysis_modules.myglobals as gbl

# Tolerance threshold under which any flux or error is considered as 0.
TOLERANCE = 1e-12
# Limit the redshift to this number of decimals
REDSHIFT_DECIMALS = 2
# Directory where the output files are stored
OUT_DIR = "out/"


class PdfAnalysis(AnalysisModule):
    """PDF analysis module"""

    parameter_list = OrderedDict([
        ("analysed_variables", (
            "array of strings",
            "List of the variables (in the SEDs info dictionaries) for which "
            "the statistical analysis will be done.",
            ["sfr", "average_sfr"]
        )),
        ("save_best_sed", (
            "boolean",
            "If true, save the best SED for each observation to a file.",
            False
        )),
        ("save_chi2", (
            "boolean",
            "If true, for each observation and each analysed variable save "
            "the reduced chi².",
            False
        )),
        ("save_pdf", (
            "boolean",
            "If true, for each observation and each analysed variable save "
            "the probability density function.",
            False
        )),
        ("storage_type", (
            "string",
            "Type of storage used to cache the generated SEDs.",
            "memory"
        ))
    ])
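
    # For reference, the corresponding section of the configuration file would
    # contain entries roughly like the following (illustrative values built
    # from the defaults above; the exact file layout is handled elsewhere):
    #
    #     analysed_variables = sfr, average_sfr
    #     save_best_sed = False
    #     save_chi2 = False
    #     save_pdf = False
    #     storage_type = memory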

    def process(self, data_file, column_list, creation_modules,
                creation_modules_params, parameters, cores):
        """Process with the PDF analysis.

        The analysis is done in two steps, both of which can run on multiple
        cores to speed up the computation. The first step computes all the
        fluxes associated with each model as well as ancillary data such as
        the SED information. The second step carries out the analysis of each
        object, considering all models at once.

        Parameters
        ----------
        data_file: string
            Name of the file containing the observations to fit.
        column_list: list of strings
            Name of the columns from the data file to use for the analysis.
        creation_modules: list of strings
            List of the module names (in the right order) to use for creating
            the SEDs.
        creation_modules_params: list of dictionaries
            List of the parameter dictionaries for each module.
        parameters: dictionary
            Dictionary containing the parameters.
        cores: integer
            Number of cores to run the analysis on.

        """

        print("Initialising the analysis module... ")

        # Rename the output directory if it exists
        backup_dir(OUT_DIR)

        # Get the parameters. They are stored in a shared module so that they
        # can be accessed by the subprocesses during the model generation and
        # analysis. This avoids transmitting the exact same data all over again.
        gbl.analysed_variables = parameters["analysed_variables"]
        gbl.creation_modules = creation_modules
        gbl.creation_modules_params = creation_modules_params
        gbl.save_best_sed = parameters["save_best_sed"].lower() == "true"
        gbl.save_chi2 = parameters["save_chi2"].lower() == "true"
        gbl.save_pdf = parameters["save_pdf"].lower() == "true"
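        # Note: the boolean parameters arrive from the configuration file as
        # strings, hence the string comparisons above.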

        # Get the needed filters in the pcigale database. We use an ordered
        # dictionary because we need the keys to always be returned in the
        # same order. We also put the filters in the shared module as they
        # are needed to compute the fluxes during the model generation.
        with Database() as base:
            gbl.filters = OrderedDict([(name, base.get_filter(name))
                                       for name in column_list
                                       if not name.endswith('_err')])

        # Read the observation table and complete it by adding errors where
        # none are provided and by adding the systematic deviation.
        obs_table = complete_obs_table(read_table(data_file), column_list,
                                       gbl.filters, TOLERANCE)

        print("Computing the model fluxes...")

        # First we get a dummy sed to obtain some information on the
        # parameters, such as which ones are dependent on the normalisation
        # factor of the fit.

        # The SED warehouse is used to retrieve the SED corresponding to a
        # given set of modules and parameters.

        gbl.warehouse = SedWarehouse(cache_type=parameters["storage_type"])

        sed = gbl.warehouse.get_sed(gbl.creation_modules,
                                    gbl.creation_modules_params[0])
        gbl.info_keys = list(sed.info.keys())
        gbl.mass_proportional_info = sed.mass_proportional_info
        gbl.has_sfh = sed.sfh is not None

        # Then, for all possible theoretical models (one for each parameter set
        # in creation_modules_params), we compute 1) the fluxes in all the
        # filters, 2) the values of the model variables, 3) the redshift of the
        # models (this is important for the analysis to easily identify which
        # models correspond to a given redshift), and 4) the sed.info values
        # (without the keys to save space; this is possible because we are
        # using an OrderedDict so it is easy to retrieve the value for a known
        # key using gbl.info_keys). Note that we could compute the last two
        # elements in the main process but we would lose time doing so. Not
        # necessarily much though. TODO: benchmark this.

        # In order not to clog the warehouse memory with SEDs that will not be
        # used again during the SED generation, we identify which parameter has
        # changed so that the cached SEDs depending on it can be discarded.
        changed_pars = find_changed_parameters(creation_modules_params)

        # Compute the SED fluxes and ancillary data in parallel
        with mp.Pool(processes=cores) as pool:
            items = pool.starmap(worker_sed, zip(creation_modules_params,
                                                 changed_pars))

        # We create the arrays to store the model fluxes and ancillary data.
        # They are stored in a shared module so they are easily accessible by
        # the processes that will carry out the analysis, without requiring any
        # data transfer that would grind pcigale to a halt. We use a numpy
        # masked array to mask the fluxes of models that would be older than
        # the age of the Universe at the considered redshift.
        gbl.model_fluxes = np.ma.empty((len(creation_modules_params),
                                        len(gbl.filters)))
        gbl.model_variables = np.ma.empty((len(creation_modules_params),
                                           len(gbl.analysed_variables)))
        gbl.model_redshifts = np.empty(len(creation_modules_params))
        gbl.model_info = [None] * len(creation_modules_params)

        # Unpack the computed data into their respective arrays.
        for idx_item, item in enumerate(items):
            gbl.model_fluxes[idx_item, :] = item[0]
            gbl.model_variables[idx_item, :] = item[1]
            gbl.model_redshifts[idx_item] = item[2]
            gbl.model_info[idx_item] = item[3]

        print('Analysing models...')

        # Mask the invalid fluxes
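        # (The SED workers are assumed to flag the fluxes of models older than
        # the age of the Universe at their redshift with a large negative
        # value, hence the -90 threshold below.)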
        gbl.model_fluxes = np.ma.masked_less(gbl.model_fluxes, -90)

        # To make it easy for the subprocesses to identify which parts of the
        # various arrays correspond to a given redshift, we 1) make a list of
        # the model redshifts, and 2) build a dictionary indicating the indices
        # corresponding to each redshift. These two objects are assigned to
        # the shared module.
        gbl.redshifts = np.unique(gbl.model_redshifts)
        gbl.w_redshifts = {redshift: gbl.model_redshifts == redshift
                           for redshift in gbl.redshifts}
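        # For illustration only, with two redshifts in the grid this would look
        # roughly like:
        #     gbl.redshifts   -> array([0.1, 0.2])
        #     gbl.w_redshifts -> {0.1: array([ True, False, ...]),
        #                         0.2: array([False,  True, ...])}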

        # Analysis of each object in parallel. All the model data are
        # transmitted through a shared module to avoid memory copies that would
        # grind pcigale to a halt.
        with mp.Pool(processes=cores) as pool:
            items = pool.starmap(worker_analysis, zip(obs_table))

        # Local arrays in which to unpack the results of the analysis
        analysed_averages = np.empty((len(items), len(gbl.analysed_variables)))
        analysed_std = np.empty_like(analysed_averages)
        best_fluxes = np.empty((len(items), len(gbl.filters)))
        best_parameters = [None] * len(items)
        best_normalisation_factors = np.empty(len(obs_table))
        best_chi2 = np.empty_like(best_normalisation_factors)
        best_chi2_red = np.empty_like(best_normalisation_factors)

        for item_idx, item in enumerate(items):
            analysed_averages[item_idx, :] = item[0]
            analysed_std[item_idx, :] = item[1]
            best_fluxes[item_idx, :] = item[2]
            best_parameters[item_idx] = item[3]
            best_normalisation_factors[item_idx] = item[4]
            best_chi2[item_idx] = item[5]
            best_chi2_red[item_idx] = item[6]

        print("Saving results...")

        save_table_analysis(obs_table['id'], gbl.analysed_variables,
                            analysed_averages, analysed_std)
        save_table_best(obs_table['id'], best_chi2, best_chi2_red,
                        best_normalisation_factors, best_parameters,
                        best_fluxes, gbl.filters)

        print("Run completed!")

# AnalysisModule to be returned by get_module
Module = PdfAnalysis