Commit 31c30672 authored by Médéric Boquien's avatar Médéric Boquien

The initial implementation of the parallel analysis ended up grinding pcigale...

The initial implementation of the parallel analysis ended up grinding pcigale to a halt. This was due to the numerous array transfers between the main process and subprocesses. To solve this problem, we now share arrays through a module. This has the advantage
that thanks to copy-on-write, we almost never need to actually copy arrays. Now all the subprocesses run at full speed. Quite a few changes for a patch that only started with the activation of cache clearing.
parent 2ea03335
# This is a dummy module to store the data that need to be shared between all
# processes. In practice, once set these data should never change. Note that we
# cannot initialise those variables in subprocesses (they inherit them from
# the parent process, ideally via copy-on-write).
# Variables known at initialisation time
warehouse = []
creation_modules = []
creation_modules_params = []
analysed_variables = []
save_best_sed = []
save_chi2 = []
save_pdf = []
filters = []
# Variables that need to be initialised after having computed the models
model_fluxes = []
model_variables = []
model_redshifts = []
model_info = []
mass_proportional_info = []
has_sfh = []
info_keys = []
redshifts = []
w_redshifts = []
......@@ -5,11 +5,14 @@
# Licensed under the CeCILL-v2 licence - see Licence_CeCILL_V2-en.txt
# Author: Yannick Roehlly & Médéric Boquien
from datetime import datetime
import os
from astropy.table import Table, Column
import numpy as np
from scipy.stats import gaussian_kde
from scipy.linalg import LinAlgError
from ...warehouse import SedWarehouse
import pcigale.analysis_modules.myglobals as gbl
# Directory where the output files are stored
OUT_DIR = "out/"
......@@ -185,8 +188,7 @@ def save_table_analysis(obsid, analysed_variables, analysed_averages,
result_table.write(OUT_DIR + RESULT_FILE)
def save_table_best(obsid, chi2, chi2_red, norm,
variables, fluxes, filters, sed):
def save_table_best(obsid, chi2, chi2_red, norm, variables, fluxes, filters):
"""Save the values corresponding to the best fit
Parameters
......@@ -205,21 +207,16 @@ def save_table_best(obsid, chi2, chi2_red, norm,
Fluxes in all bands for each object
filters: list
Filters used to compute the fluxes
sed: SED object
Used to identify what to output in the table
"""
best_model_table = Table()
best_model_table.add_column(Column(obsid.data, name="observation_id"))
best_model_table.add_column(Column(chi2, name="chi_square"))
best_model_table.add_column(Column(chi2_red, name="reduced_chi_square"))
if sed.sfh is not None:
best_model_table.add_column(Column(norm, name="galaxy_mass",
unit="Msun"))
for index, name in enumerate(sed.info.keys()):
for index, name in enumerate(gbl.info_keys):
column = Column([variable[index] for variable in variables], name=name)
if name in sed.mass_proportional_info:
if name in gbl.mass_proportional_info:
column *= norm
best_model_table.add_column(column)
......@@ -228,3 +225,14 @@ def save_table_best(obsid, chi2, chi2_red, norm,
best_model_table.add_column(column)
best_model_table.write(OUT_DIR + BEST_MODEL_FILE)
def backup_dir(directory):
    """Back up an existing directory and recreate it empty.

    If *directory* already exists, it is renamed to
    "YYYYMMDDHHMM_<directory>" (timestamped to the current minute) so that
    previous results are preserved, and a fresh, empty *directory* is then
    created.

    Parameters
    ----------
    directory: string
        Name of the directory to back up and recreate.
    """
    if os.path.exists(directory):
        # Prefix with a per-minute timestamp so successive runs get
        # distinct backup names.
        new_name = datetime.now().strftime("%Y%m%d%H%M") + "_" + directory
        os.rename(directory, new_name)
        # Bug fix: report the directory that was actually renamed rather
        # than the unrelated module-level OUT_DIR constant.
        print("The existing {} directory was renamed to {}".format(
            directory,
            new_name
        ))
    # Bug fix: recreate the requested directory, not OUT_DIR.
    os.mkdir(directory)
......@@ -7,30 +7,21 @@
import numpy as np
from .utils import save_best_sed, save_pdf, save_chi2
import pcigale.analysis_modules.myglobals as gbl
# Probability threshold: models with a lower probability are excluded from
# the moments computation.
MIN_PROBABILITY = 1e-20
def sed(warehouse, creation_modules, model_params, analysed_variables,
filters):
def sed(model_params, changed):
"""Worker process to retrieve a SED and return the relevant data
Parameters
----------
warehouse: SedWarhouse object
Used to retrieve a SED. Ideally a different warehouse should be used
for each forked process but it does not seem to cause any problem so
far
creation_modules: list
List of creation modules to build the SED
model_params: list
Parameters of the creation modules
analysed_variables: list
Names of the analysed variables
filters: list
Filters to compute the SED fluxes
Returns
-------
......@@ -43,57 +34,46 @@ def sed(warehouse, creation_modules, model_params, analysed_variables,
the list returned by starmap anyway.
"""
sed = warehouse.get_sed(creation_modules, model_params)
sed = gbl.warehouse.get_sed(gbl.creation_modules, model_params)
gbl.warehouse.partial_clear_cache(changed)
if 'age' in sed.info and sed.info['age'] > sed.info['universe.age']:
model_fluxes = -99. * np.ones(len(filters))
model_variables = -99. * np.ones(len(analysed_variables))
model_fluxes = -99. * np.ones(len(gbl.filters))
model_variables = -99. * np.ones(len(gbl.analysed_variables))
else:
model_fluxes = np.array([sed.compute_fnu(filter_.trans_table,
filter_.effective_wavelength)
for filter_ in filters.values()])
for filter_ in gbl.filters.values()])
model_variables = np.array([sed.info[name]
for name in analysed_variables])
for name in gbl.analysed_variables])
redshift = sed.info['redshift']
info = sed.info.values()
return model_fluxes, model_variables, redshift, info
def analysis(obs, model_fluxes, model_variables, info, filters, sed,
analysed_variables, creation_modules, creation_modules_params,
save):
def analysis(obs):
"""Worker process to analyse the PDF and estimate parameters values
Parameters
----------
obs: row
Input data for an individual object
model_fluxes: 2D array
Fluxes for each model and for each filter
model_variables: 2D array
Variables values for each model
info: list
sed.info for each model
filters: list
Filters to compute the fluxes
sed: SED object
Used to retrieve which parameters are normalisation dependent
analysed_variables: list
Names of analysed variables
creation_modules: list
Creation modules named to recreate the best SED
creation_modules_params: list
Creation modules parameters to recreate the best SED
save: set
Booleans indicating whether to save the best SED, best χ² and PDF
Returns
-------
The analysed parameters (values+errors), best raw and reduced χ², best
normalisation factor, info of the best SED, fluxes of the best SED
"""
obs_fluxes = np.array([obs[name] for name in filters])
obs_errors = np.array([obs[name + "_err"] for name in filters])
w = np.where(gbl.w_redshifts[gbl.redshifts[np.abs(obs['redshift'] -
gbl.redshifts).argmin()]])
model_fluxes = gbl.model_fluxes[w[0], :]
model_variables = gbl.model_variables[w[0], :]
obs_fluxes = np.array([obs[name] for name in gbl.filters])
obs_errors = np.array([obs[name + "_err"] for name in gbl.filters])
# Some observations may not have flux value in some filters, in
# that case the user is asked to put -9999 as value. We mask these
......@@ -136,16 +116,10 @@ def analysis(obs, model_fluxes, model_variables, info, filters, sed,
# We take the mass-dependent variable list from the last computed
# sed.
for index, variable in enumerate(analysed_variables):
if variable in sed.mass_proportional_info:
for index, variable in enumerate(gbl.analysed_variables):
if variable in gbl.mass_proportional_info:
model_variables[:, index] *= norm_facts
# We also add the galaxy mass to the analysed variables if relevant
if sed.sfh is not None:
analysed_variables.insert(0, "galaxy_mass")
model_variables = np.dstack((norm_facts,
model_variables))
##################################################################
# Variable analysis #
##################################################################
......@@ -154,7 +128,8 @@ def analysis(obs, model_fluxes, model_variables, info, filters, sed,
# likelihood as weight. We first build the weight array by
# expanding the likelihood along a new axis corresponding to the
# analysed variable.
weights = likelihood[:, np.newaxis].repeat(len(analysed_variables), axis=1)
weights = likelihood[:, np.newaxis].repeat(len(gbl.analysed_variables),
axis=1)
# Analysed variables average and standard deviation arrays.
analysed_averages = np.ma.average(model_variables, axis=0,
......@@ -168,19 +143,20 @@ def analysis(obs, model_fluxes, model_variables, info, filters, sed,
# with the least χ².
best_index = chi2_.argmin()
if save[0]:
save_best_sed(obs['id'], creation_modules,
creation_modules_params[best_index],
if gbl.save_best_sed:
save_best_sed(obs['id'], gbl.creation_modules,
gbl.creation_modules_params[best_index],
norm_facts[best_index])
if save[1]:
save_chi2(obs['id'], analysed_variables, model_variables, chi2_red)
if save[2]:
save_pdf(obs['id'], analysed_variables, model_variables, likelihood)
if gbl.save_chi2:
save_chi2(obs['id'], gbl.analysed_variables, model_variables, chi2_red)
if gbl.save_pdf:
save_pdf(obs['id'], gbl.analysed_variables, model_variables,
likelihood)
return (analysed_averages,
analysed_std,
chi2_[best_index],
chi2_red[best_index],
np.array(model_fluxes[best_index, :]), # do NOT remove np.array()
list(gbl.model_info[w][best_index]),
norm_facts[best_index],
list(info[best_index]),
model_fluxes[best_index, :])
chi2_[best_index],
chi2_red[best_index])
......@@ -24,9 +24,9 @@ from astropy.table import Table
from . import AnalysisModule
from ..warehouse import SedWarehouse
from ..data import Database
from .utils import find_changed_parameters
def _worker_sed(warehouse, filters, modules, parameters):
def _worker_sed(warehouse, filters, modules, parameters, changed):
"""Internal function to parallelize the computation of fluxes.
Parameters
......@@ -45,6 +45,7 @@ def _worker_sed(warehouse, filters, modules, parameters):
"""
sed = warehouse.get_sed(modules, parameters)
warehouse.partial_clear_cache(changed)
row = []
......@@ -147,11 +148,13 @@ class SaveFluxes(AnalysisModule):
# Parallel computation of the fluxes
with SedWarehouse(cache_type=parameters["storage_type"]) as warehouse,\
mp.Pool(processes=cores) as pool:
changed_pars = find_changed_parameters(creation_modules_params)
out_rows = pool.starmap(_worker_sed,
zip(repeat(warehouse),
repeat(filter_list),
repeat(creation_modules),
creation_modules_params))
creation_modules_params,
changed_pars))
# The zip call is to convert the list of rows to a list of columns.
out_table = Table(list(zip(*out_rows)), names=out_columns)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment