S.Sénési for Météo-France - sept 2019 to march 2021
data_versions[experiment][variable][table][model][variant]=(grid,version,data_period)
do_test=True
data_versions_tag = "20201210"
data_versions_dir = "./"
experiments = ["piControl","historical","ssp126","ssp245","ssp585","ssp119"]
variables = {
"Amon": ["pr","tas","prw","evspsbl"],
"Lmon": ["mrro","mrso","mrsos"],
"Omon": ["sos"],
"day" : ["pr"]
}
periods = {
"historical" : "18500101-20141230", # Do not include 20141231 thanks to the british guys !
"ssp119" : "2015-2099" , #should end at 2100, but, at some date, CAMS-1 did not produce last year
"ssp126" : "2015-2099" ,
"ssp245" : "2015-2099" ,
"ssp585" : "2015-2099" ,
"ssp370" : "2015-2099" ,
"piControl" : "*"
}
# we relax the AR5 criterion of 100 years for spinup followed by 400 years
# by suppressing the 100 years criterion,
piControl_minimum_duration = 400
# For models providing some variables in multiple grids, we choose one or two
# (actually used for some ocean variables, as "sos", and for GFDL-CM4 for sos and pr_day)
preferred_grids = {
"CESM2-WACCM":"gn","CESM2-WACCM-FV":"gn","CESM2-WACCM-FV2":"gn","CESM2-FV2":"gn","CESM2":"gn",
"CNRM-CM6-1":"gr1","CNRM-ESM2-1":"gr1",
"GFDL-ESM4":"gn","GFDL-CM4":["gn","gr1"],
"IPSL-CM6A-LR" : "gr1",
"MRI-ESM2-0" : "gn"}
print_all = False
# Climaf version >= 1.2.13 (see https://climaf.readthedocs.io)
climaf_lib = "/home/ssenesi/climaf_installs/running"
# AR6/WGI/chapter8 CliMAF-based package
CAMMAC = "/home/ssenesi/CAMMAC"
if do_test :
data_versions_tag = "20201210_test"
#experiments=["piControl","ssp245"]
experiments=["piControl"]
#variables={"Amon": ["pr","evspsbl"],}
#experiments=["ssp245"]
variables={"Amon": ["evspsbl"],}
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import datetime, glob, json, sys, copy
#
from climaf.api import *
from climaf.period import init_period
from climaf.cache import stamping
climaf.cache.stamping=False
from CAMMAClib.ancillary import feed_dic
from CAMMAClib.mips_et_al import institute_for_model, mip_for_experiment, models_for_experiments,\
read_versions_dictionnary
def list_models_and_institute() :
global institutes
institutes=dict()
pat="/bdd/CMIP6/CMIP/*/*"
alls=glob.glob(pat)
i=dict()
models=set()
for e in alls :
m=e.split("/")[5]
models.add(m)
i[m]=e.split("/")[4]
for m in sorted(list(models)):
print "%-20s %s"%(m,i[m])
institutes[m]=i[m]
#
#list_models_and_institute()
def print_models_with_more_than_one_grid_for_sos() :
from CAMMAClib.mips_et_al import models_data
for m in models_data :
p=models_data[m][0]
if p != "CMIP6" :
continue
i=models_data[m][1]
pat="/bdd/CMIP6/CMIP/%s/%s/historical/*"%(i,m)
reals=glob.glob(pat)
if len(reals) > 0 :
grids=set()
for r in reals :
grs=glob.glob("%s/Omon/sos/*"%r)
#grs=glob.glob("%s/day/pr/*"%r)
for g in grs :
gg=g.split("/")[10]
grids.add(gg)
if len(list(grids)) > 1:
print "%-20s "%m,list(grids)
else :
print "No historical data for =",m
#print_models_with_more_than_one_grid_for_sos()
def list_CMIP6_models_that_run(experiment):
print "Identifying models that ran %s"%experiment
mip=mip_for_experiment(experiment)
exps=glob.glob("/bdd/CMIP6/%s/*/*/%s"%(mip,experiment))
models = [ e.split("/")[5] for e in exps ]
print "Done"
return models
def check_that_institutes_are_known() :
piControl_models=list_CMIP6_models_that_run("piControl")
print len(piControl_models)
ok=True
for m in piControl_models :
try :
i=institute_for_model(m)
except :
ok=False
print("Must document institute for %s"%m)
if not ok :
raise ValueError("Must document some institute(s)")
def select_all_attributes_combinations_with_data(all_cases,resolve_issues,model,\
experiment,variable,table,period,project="CMIP6",imposed_realization=None):
"""
Feed all_cases[MODEL][GRID][REALIZATION][VERSION] with the
corresponding dataset objects for VARIABLE in PROJECT and EXPERIMENT, ensuring there is
no hole in data coverage, and complying with IMPOSED_REALIZATION (if not None)
Does not actually check that PERIOD is covered by datafiles
Will feed entries for various grids, versions and realizations (provided data exist)
Also feed dic resolve_issues with datasets showing a problem at 'resolve' stage (usually
holes in data coverage)
"""
d=ds(project=project, experiment=experiment,model=model, institute="*",
period=period, variable=variable, table=table,
version="*", mip=mip_for_experiment(experiment),realization="*" )
try :
choices=d.explore('choices')
except :
print "error with %s"%repr(d)
raise ValueError("error with %s"%repr(d))
if len(choices)==0 :
return
#print model,experiment,choices
grids=choices['grid']
if type(grids)!= type([]) : grids=[grids]
#
versions=choices['version']
if type(versions)==type([]) :
if 'latest' in versions : versions.remove('latest')
else : versions=[versions]
#
if imposed_realization is not None :
if imposed_realization not in choices['realization'] :
raise ValueError("Imposed realization %s not found for %s %s %s in %s"%\
(imposed_realization,model,experiment,variable,choices['realization']))
else:
realizations=[imposed_realization]
else:
realizations=choices['realization']
if type(realizations) != type([]) : realizations=[realizations]
#
got_some=False
for g in grids :
for v in versions:
for r in realizations :
d=ds(project=project, experiment=experiment,model=model,
institute=institute_for_model(model),
period=period, variable=variable, table=table,
version=v, mip=mip_for_experiment(experiment),realization=r, grid=g )
try :
d=d.explore('resolve')
# Not all combinations or grid, realization and version have data
if d.baseFiles() is not None:
feed_dic(all_cases,d,model,g,r,v)
got_some=True
except :
#print "--> Issue (holes?) for ",model, experiment, variable, table, g,r,v
feed_dic(resolve_issues,`d`,model,g,r,v)
continue
return got_some
def select_versions(cases,selection,coverage_issues,model,experiment,variable,table,period,project):
"""
Scrutinize all datasets in CASES[MODEL][GRID][REALIZATION][VERSION],
selecting the prefered grid for MODEL (if applicable), then looping across realizations and their
versions for identifying one version per realization (with prioriy to the newest) which datafiles
actually covers PERIOD
The result is stored in SELECTION[MODEL][REALIZATION] as a triplet (grid,version,covered_data_period)
For piControl, PERIOD="*" is interpreted as : data coverage is longer than global variable
piControl_minimum_length
Pairs with uncomplete data coverage are stored in COVERAGE_ISSUES, but only until (in each loop
on versions) a correct version is found
"""
if experiment != "piControl" and period == "*" :
raise ValueError("Cannot process period * except for piControl")
#
# Check there is no ambiguity on grid
grids=cases[model].keys()
if len(grids) > 1 :
gr=None
if model in preferred_grids :
gr=preferred_grids[model]
if type(gr)==type([]) :
gr=set(gr).intersection(set(grids))
if len(gr)==1 :
gr=gr.pop()
else :
gr=None
if gr is None or gr not in grids :
print "!!! multiple grids for ",model,experiment, variable, table, ":",grids
raise ValueError("!!! multiple grids for %s %s %s %s : %s. Please update dic preferred_grids"%\
(model,experiment, variable, table,grids))
else :
gr=grids[0]
#
reals=cases[model][gr].keys()
found_one_realization=False
for real in reals :
ok=False
holes=dict()
versions=cases[model][gr][real].keys()
versions.sort()
sversions=[ v for v in versions ]
#
while len(versions)>0 and not ok :
# We loop on versions, beginning with highest value (i.e. latest one)
last=versions[-1]
# Check available period is OK
d=ds(project=project, experiment=experiment,model=model, institute=institute_for_model(model),
period="*", variable=variable, table=table,
version=last, mip=mip_for_experiment(experiment),realization=real, grid=grids[0] )
data_period=d.explore('choices',operation='union').get('period',None)
speriod=data_period
#print "model=",model,"data_period=",data_period,type(data_period),type(data_period[0])
if data_period is None :
holes[last]=`data_period`
#raise ValueError ("Logic issue : holes==None actually used for %s"%repr(d))
else:
data_period=data_period[0]
actual_period=None
if period != "*" :
# This is the case of experiments with a defined begin and end (as coded in 'period')
if data_period.includes(climaf.classes.init_period(period)) :
actual_period=data_period
else :
# piControl case
if (data_period.end - data_period.start).days >= piControl_minimum_duration * 360 :
actual_period=data_period
if actual_period is None :
holes[last]=`speriod`
else :
ok=True
if not ok :
feed_dic(coverage_issues,holes,model,gr,real,last)
versions.remove(last)
if ok :
feed_dic(selection,(grids[0],last,data_period.pr()),model,real)
found_one_realization=True
#print "%-20s %s: use %s (among %s : %s "%(model,real, last, len(sversions),sversions)
else :
#print "%-20s %s: no valid version among %s : %s "%(model,real, len(sversions),sversions)
pass
if found_one_realization is False :
print "no valid realization for ",model,experiment,variable,table,period
else :
return True
def write_versions_dic(d,tag) :
jsfile="%s/Data_versions_selection_%s.json"%(data_versions_dir,tag)
with open(jsfile,"w") as f :
json.dump(d,f,separators=(',', ': '),indent=3,ensure_ascii=True)
print "Data versions dictionnary written as "+jsfile
#write_versions_dic("201902071814")
def read_versions_dic(tag) :
jsfile="%s/Data_versions_selection_%s.json"%(data_versions_dir,tag)
with open(jsfile,"r") as f :
rep=json.load(f)
return rep
def read_all_versions_dic(tag):
select = read_versions_dic(tag)
all_holes = read_versions_dic(tag+"_holes")
all_coverages = read_versions_dic(tag+"_resolve")
noreals = read_versions_dic( tag+"_noreals")
return select, all_versions, all_holes, all_coverages, noreals
#dics=read_all_versions_dic("20200529t")
def select_models_data_versions(experiment,variable="pr",table="Amon",period=None,
imposed_realizations={},models=None,do_select=True):
"""
For CMIP6 model data, identify models which ran EXPERIMENT (or use provided list of models)
and which produced VARIABLE in TABLE.
For each realization (or for the single realization indicated for each model in dict
IMPOSED_REALIZATIONS), identify latest data version which covers PERIOD (which defaults
to experiment duration, or a long enough period for piControl)
IMPOSED_REALIZATIONS is a dict with keys=model, or an empty dict for not using the feature.
It is not used for experiment piControl
4 dics and a list are returned :
- cases[model][grid][realization][version] = the corresponding dataset object for all available
datasets
- selection[model][realization]=(grid,version,period) : provides the selected versions and
grids, and documents the period, for all datasets with the latest possible available version
with enough data
- holes[model][grid]realization][version] = printable description of the data holes (for each
combination showing some hole)
- coverage_issues[model][grid]realization][version] = printable description of the available
data period for those case where this period is too short
- no_real = list of models with no valid realization for that variable and experiment
"""
print
print "Processing experiment %s, variable %s, table %s"%(experiment,variable,table)
print 50*"*"
if models is None :
models=list_CMIP6_models_that_run(experiment)
#
if period is None :
period=periods[experiment]
#
cases=dict()
selection=dict()
holes=dict()
coverage_issues=dict()
no_real=[]
project="CMIP6"
#
for model in models :
#print model,
#print model,variable,table
imposed=imposed_realizations.get(model,None)
if experiment=="piControl" : imposed=None
#
got_some=select_all_attributes_combinations_with_data(cases,holes,model,experiment,\
variable,table,period,project,imposed)
#print cases
if do_select :
ok=None
if got_some and model in cases :
ok=select_versions(cases,selection,coverage_issues,model,experiment,variable,\
table,period,project)
if ok is None :
no_real.append(model)
print
print "models with no realization showing complete data for %s %s : "%(experiment,variable),no_real
#
return cases, selection, holes, coverage_issues, no_real
#clog('error')
#c,ok,ri,ci,noreal=select_models_data_versions("ssp585",variable="pr",table="Amon",period=None)#,models=["FGOALS-f3-L"])
def add_entries_for_P_E(d):
"""
Create P-E entries in versions dict d , when there are entries for 'pr' and 'evspsbl' with
same realization
Assumes structure of d is those of dict "selection" returned by select_models_data_versions()
For piControl, if evspsbl's period does not include pr's one, use intersection period both
for pr and P-E, if it is long enough
"""
#
# Add entry for P-E by duplicating entry for pr and then removing entries which do not have
# the matching realization in evspsbl entries
for exp in d :
if not ('pr' in d[exp] and "Amon" in d[exp]['pr'] and 'evspsbl' in d[exp]) :
continue
for model in d[exp]['pr']['Amon']:
for real in d[exp]['pr']['Amon'][model] :
print "processing ",exp,model,real,
if model in d[exp]["evspsbl"]["Amon"] and real in d[exp]["evspsbl"]["Amon"][model]:
pgrid,pversion,pperiod=d[exp]['pr']['Amon'][model][real]
egrid,eversion,eperiod=d[exp]['pr']['Amon'][model][real]
if pgrid != egrid :
print "issue on grids",pgrid,egrid
continue
#
pperiod=init_period(pperiod)
eperiod=init_period(eperiod)
#
if exp != "piControl" or eperiod.includes(pperiod) :
print "not piControl, OK "
feed_dic(d,(pgrid,pversion,repr(pperiod)),exp,"P-E","Amon",model,real)
else :
# Need to use intersection of periods for 'pr', for correct
# computation of P-E by CliMAF
inter=pperiod.intersects(eperiod)
print "piControl", inter,
if (inter.end - inter.start).years >= piControl_minimum_duration:
print "OK"
feed_dic(d,(pgrid,pversion,repr(inter)),exp,"pr","Amon",model,real)
feed_dic(d,(pgrid,pversion,repr(inter)),exp,"P-E","Amon",model,real)
else :
print "NOK",(inter.end - inter.start).years
print "For %20s : % 2d pr entries , % 2d P-E entries"%(exp,entries_number(d,exp,"pr"),entries_number(d,exp,"P-E"))
return True
def entries_number(d,exp,variable,table="Amon") :
try :
dic=d[exp][variable][table]
except :
return 0
count=0
for model in dic :
count += len(dic[model].keys())
#print "%s %s,"%(model,l),
return count
sto=sys.stdout
#sys.stdout=sto
def create_versions_dictionnary(experiments,variables,outfile=True,models=None):
"""
Scrutinize available data on file system for :
- a list of experiments,
- a dict of list of variables per table
- a list of models (if provided) or all models
Returns (and writes as json files) 5 dicts such as the 4 dicts and the list returned by
select_models_data_versions() (except for augmenting the dicts structure to systematically
have keys : [experiment][variable][table][model] at the beginning)
Please refer to that function for the details of those returned dicts : selection, cases, holes,
coverage_issues, no_realization
Dict 'selection' is the main result, and documents all usable data by the grid,
version and period covered
We basically loop calling select_models_data_versions() on experiments and variables per table, and
we add entries for a compound variable 'P-E' for each case where we have variable 'pr' in table 'Amon'
and also an entry for 'evspsbl' with the same realization index as 'pr'. We also check then
that data period for 'evspsbl' is OK vs. 'pr' one
Dict 'cases' is the only one not not written as json file
"""
if outfile :
import sys
f = open(data_versions_dir+"/"+data_versions_tag+".txt", 'w')
orig_stdout = sys.stdout
sys.stdout = f
#
select=dict() ; all_cases=dict() ; all_holes=dict(); all_coverages=dict(); noreals=dict()
clog('error')
for experiment in experiments:
for table in variables :
for variable in variables[table] :
cases,selected,holes,coverages,noreal=select_models_data_versions(experiment,
variable,table,models=models)
feed_dic(all_cases ,cases ,experiment,variable,table)
feed_dic(select ,selected ,experiment,variable,table)
feed_dic(all_holes ,holes ,experiment,variable,table)
feed_dic(all_coverages ,coverages,experiment,variable,table)
feed_dic(noreals ,noreal ,experiment,variable,table)
#
#
# Maybe call check_realization_mismatch and chek_grid_mismatch here...
if outfile :
sys.stdout = orig_stdout
f.close()
add_entries_for_P_E(select)
# Write result as json
write_versions_dic(select,data_versions_tag)
write_versions_dic(all_holes,data_versions_tag+"_holes")
write_versions_dic(all_coverages,data_versions_tag+"_resolve")
write_versions_dic(noreals, data_versions_tag+"_noreals")
return select, all_cases, all_holes, all_coverages, noreals
def explain(variable, table, experiment,dics,
print_select=True, print_holes=True, print_coverage=True, print_novar=True, explicit=True) :
#
select, all_versions, all_holes, all_coverages, noreals = dics
if explicit :
prefix="%10s %10s %4s "%(experiment, variable, table)
else :
prefix=""
print
if not explicit :print prefix+"\n"+30*"*"
for model in select[experiment][variable][table]:
grid,real,v,period=select[experiment][variable][table][model]
if experiment != "piControl" : period=""
if print_select :
print "%-20s "%model+prefix+"OK ",grid,real,v,period
for model in noreals[experiment][variable][table]:
reason_found=False
try :
for grid in all_holes[experiment][variable][table][model] :
for real in all_holes[experiment][variable][table][model][grid]:
for v in all_holes[experiment][variable][table][model][grid][real] :
a=eval(all_holes[experiment][variable][table][model][grid][real][v])
if print_holes :
holes=`a.explore('choices')['period']`
if len(holes) > 80 : holes=holes[0:80]+"....."
print "%-20s "%model+prefix+"holes ",grid,real,v,": ",holes
reason_found=True
except :
pass
try :
for grid in all_coverages[experiment][variable][table][model] :
for real in all_coverages[experiment][variable][table][model][grid]:
for v in all_coverages[experiment][variable][table][model][grid][real] :
a=all_coverages[experiment][variable][table][model][grid][real][v]
if print_coverage :
print "%-20s "%model+prefix+"duration ",grid,real,v,": ",a
reason_found=True
except :
pass
if not reason_found :
try :
case=all_versions[experiment][variable][table][model]
except :
if print_novar :
print "%-20s "%model+prefix+"no_var"
def explain_sorted(variable, table, experiment,dics,explicit=True) :
explain(variable, table, experiment,dics, print_select=True, print_holes=False, print_coverage=False, print_novar=False, explicit=explicit)
explain(variable, table, experiment,dics, print_select=False, print_holes=True, print_coverage=False, print_novar=False, explicit=explicit)
explain(variable, table, experiment,dics, print_select=False, print_holes=False, print_coverage=True, print_novar=False, explicit=explicit)
explain(variable, table, experiment,dics, print_select=False, print_holes=False, print_coverage=False, print_novar=True, explicit=explicit)
#explain_sorted("sos","Omon","piControl",dics,explicit=True)
#explain_sorted("sos","Omon","piControl",dics,explicit=True)
def check_versions_are_latest(d):
print "Versions which are not the latest :"
for experiment in d:
for variable in d[experiment] :
if variable=="P-E" : continue
for table in d[experiment][variable]:
for model in d[experiment][variable][table] :
for r in d[experiment][variable][table][model] :
g,v,p=d[experiment][variable][table][model][r]
search="/bdd/CMIP6/%s/%s/%s/%s/%s/%s/%s/%s/*"%(
mip_for_experiment(experiment),institute_for_model(model),model,experiment,r,table,variable,g)
versions=glob.glob(search)
versions=[ ve.split("/")[-1] for ve in versions ]
versions.sort()
if len(versions)==0 :
print "For %s, %s and %s, no version with %s"%(experiment,variable,model,search)
continue
if v != versions[-1] :
print "For %s, %s %s and %s, version is not the latest (%s,%s). See %s"%(experiment,variable,model,r,v,versions[-1],search)
#else :
# print "OK for %s, %s and %s, %s)"%(experiment,variable,model,v)
print "Done"
def check_evspsbl_sign(d) :
means=dict()
for experiment in d:
#print experiment
for variable in ["evspsbl"] :
#print "\t",variable
if variable not in d[experiment] :
continue
for table in d[experiment][variable]:
for model in d[experiment][variable][table] :
realizations = d[experiment][variable][table][model].keys()
for realization in [realizations[0]] :
#print model,realization
g,v,p=d[experiment][variable][table][model][realization]
fyear=p.split('-')[0]
table="Amon"
if variable in ["mrro","mrso"] : table="Lmon"
dat=ds(project="CMIP6", experiment=experiment,model=model, institute="*",
period=fyear+"01", variable=variable, table=table, grid=g,
version=v, mip=mip_for_experiment(experiment),realization=realization )
mean=cvalue(ccdo(dat,operator="fldmean"))
if model not in means : means[model]=dict()
means[model][experiment]=mean
if mean < 0. :
#print"\t\t%20s"%model,r,v
print"%20s %20s %20s"%(experiment, variable,model),realization,v
def check_grids_mismatch(d):
mismatches=[]
for experiment in d:
grids=dict()
for variable in d[experiment] :
print variable
#if experiment=="piControl" :
# continue
for model in d[experiment][variable] :
for realization in d[experiment][variable][model] :
g,v,p=d[experiment][variable][model][realization]
feed_dic(grids,g,model,experiment)
for model in grids :
rs=set(grids[model].values())
if len(rs) > 1 :
print "\t%20s %s has varied grids "%(model,variable),rs,grids[model]
mismatches.append((model,variable))
return mismatches
#dics = create_versions_dictionnary(experiments,variables={"Amon":["pr","evspsbl"]},outfile=False,models=["CNRM-CM6-1"])
dics = create_versions_dictionnary(experiments,variables,outfile=False)#,models=["CNRM-CM6-1"])
select, all_versions, resolve_issues, all_holes, noreals = dics
print "Check versions are latest"
check_versions_are_latest(select)
#print "\nCheck grid mismatch"
#check_grids_mismatch(select)
print "\nCheck sign of evspsbl"
check_evspsbl_sign(select)
if print_all :
print noreals
if print_all:
print all_holes
if print_all :
print all_coverages