CAMMAC https://cammac.readthedocs.io

S.Sénési for Météo-France - sept 2019 to march 2021

Query the ESGF errata service for the datasets of a data versions dictionary (only the lowest realization number), and organize the reported issues by variable, severity and description

In [ ]:
# Notebook parameters : tag and directory designating the data versions
# dictionnary which is read further down by read_versions_dictionnary()
data_versions_tag = "20210201_derived"
data_versions_dir = "/home/ssenesi/CAMMAC/select_data_versions"
In [ ]:
from IPython.core.display import display, HTML, Image
# Inject a CSS rule setting the width of the `.container` element to 100%
# (presumably so that the notebook uses the full width of the browser window)
display(HTML("<style>.container { width:100% !important; }</style>"))
import requests  # use pip or conda to install it if needed
import json
import sys
In [ ]:
from CAMMAClib.ancillary  import feed_dic
from CAMMAClib.mips_et_al import institute_for_model, mip_for_experiment,\
    models_for_experiments,read_versions_dictionnary, prefered_variant 
In [ ]:
def errata(dataset_drs,base_url="https://errata.es-doc.org/1/"):
    """
    Query the errata service for erratas on a dataset DRS, such as
    >>> dataset_drs="CMIP6.DAMIP.NASA-GISS.GISS-E2-1-G.hist-sol.r1i1p1f1.AERmon.bldep.gn.v20180912"

    Arg dataset_drs is the dataset identifier, appended as is to the query URL
    Arg base_url is the root URL of the errata service (with a trailing slash)

    Returns a list of pairs (severity, description), one per errata attached
    to the handles that the service resolves for this dataset (possibly an
    empty list).

    Returns None when the answer of the service cannot be decoded as JSON (a
    message is then printed) or when it includes an 'errorCode' entry.
    """
    # Local import : keeps this function usable whatever the notebook imports
    import ast

    resolve_url = base_url + "resolve/simple-pid?datasets=" + dataset_drs
    response = requests.get(resolve_url)
    try:
        resolved = response.json()
    except ValueError:
        # Single-argument form : same output with Python 2 and Python 3
        print("\nNo Json object for " + dataset_drs)
        return None
    if 'errorCode' in resolved:
        return None

    erratas = []
    for handle in resolved:
        uids = resolved[handle]['errataIds']
        if not isinstance(uids, list):
            # The ids are then taken to be the text of a Python list literal.
            # Parse it as a literal only : text received from a remote
            # service must never be handed to eval()
            uids = ast.literal_eval(uids)
        for uid in uids:
            issue = requests.get(base_url + "issue/retrieve?uid=" + uid).json()['issue']
            erratas.append((issue['severity'], issue['description']))
    return erratas

A simple example of query

In [ ]:
# Erratas for a single dataset, designated by its full DRS
errata(
    "CMIP6.DAMIP.NASA-GISS.GISS-E2-1-G.hist-sol.r1i1p1f1.AERmon.bldep.gn.v20180912"
)
In [ ]:
# Same query for a CIESM 'historical' dataset
errata(
    "CMIP6.CMIP.THU.CIESM.historical.r1i1p1f1.Amon.pr.gr.v20200417"
)
In [ ]:
# Same query for a CIESM 'ssp585' dataset
errata(
    "CMIP6.ScenarioMIP.THU.CIESM.ssp585.r1i1p1f1.Amon.tas.gr.v20200417"
)

A function for querying the erratas for a versions dict, and organizing the outputs in a dict berrata2models[variable][severity][description] whose values are lists of model.experiment

In [ ]:
def _scan_erratas(dic, experiments, variables, max_count, errata_base_url):
    """
    Helper for analyze_erratas : walk the data versions dictionnary DIC and
    query the errata service once per distinct dataset.

    Returns the pair (erratas, seen) where :
      - erratas[variable][severity][description] is a set of "model.experiment"
      - seen is the set of the (expid, variant, table, variable, grid, version)
        tuples which were actually queried

    The scan stops as soon as MAX_COUNT datasets (if not None) were queried.
    """
    erratas = dict()
    seen = set()  # a set : membership test does not slow down as cases pile up
    for experiment in experiments:
        # Progress report, on a single line per variable (works the same
        # with Python 2 and Python 3)
        sys.stdout.write("%s " % experiment)
        # The default list of variables must be worked out for each
        # experiment : experiments do not necessarily share their variables
        if variables is None:
            experiment_variables = dic[experiment].keys()
        else:
            experiment_variables = variables
        for variable in experiment_variables:
            sys.stdout.write("%s " % variable)
            for table in dic[experiment][variable]:
                per_model = dic[experiment][variable][table]
                for model in per_model:
                    variants = set(per_model[model].keys())
                    variant = prefered_variant(variants, "", model)
                    if variant is None:
                        raise ValueError("Issue with prefered variant for %s %s %s %s"%(experiment,variable,model,variants) )
                    grid, version, _period = per_model[model][variant]
                    # e.g. CMIP6.CMIP.MPI-M.MPI-ESM1-2-HR.piControl
                    expid = "CMIP6.%s.%s.%s.%s" % (
                        mip_for_experiment(experiment), institute_for_model(model), model, experiment)
                    nuple = (expid, variant, table, variable, grid, version)
                    if nuple in seen:
                        continue
                    if max_count is not None and len(seen) >= max_count:
                        # Limit reached : leave all the nested loops at once
                        sys.stdout.write("\n")
                        return erratas, seen
                    seen.add(nuple)
                    drs = "%s.%s.%s.%s.%s.%s" % nuple
                    err_list = errata(drs, errata_base_url)
                    if err_list is None:
                        continue
                    by_severity = erratas.setdefault(variable, dict())
                    for severity, description in err_list:
                        by_description = by_severity.setdefault(severity, dict())
                        # Fields 4 and 5 of expid, i.e. model.experiment
                        by_description.setdefault(description, set()).add(
                            "%s.%s" % (model, experiment))
            sys.stdout.write("\n")
    return erratas, seen


def analyze_erratas(dic,experiments=None,variables=None, max_count=None, do_print=True, print_expids=False ) :
    """
    Use a data versions dictionnary such as produced by notebook select_data_versions and
    query the ESGF errata service for all corresponding datasets

    Returns a dictionnary of the model.experiment pairs with an errata, grouped that way :

    >>>  d[variable][severity][errata_description] = [ ... sorted list of model.experiment ...]

    plus two entries, "Errata service query date" and "Errata service query url",
    which record when and where the errata service was queried.

    Arg dic is a data versions dictionnary organized that way :
       data_versions[expid][variable][table][model][variant]=(grid,version,data_period)
    Arg experiments allows to restrict the analysis to a list of experiments (default : all
       experiments of dic)
    Arg variables allows to restrict the analysis to a list of variables (default : for each
       experiment, all of its variables)
    Arg max_count, if not None, is the maximum number of datasets for which the errata
       service is queried
    Arg do_print tells to print the result using print_errata2models
    Arg print_expids is forwarded to print_errata2models

    Raises ValueError if prefered_variant returns None for some model
    """
    from datetime import datetime

    errata_base_url = "https://errata.es-doc.org/1/"
    if experiments is None:
        experiments = dic.keys()
    berrata2models, already_done = _scan_erratas(
        dic, experiments, variables, max_count, errata_base_url)
    print("")

    # Turn the sets into (sorted) lists : the result can then be dumped as
    # JSON, and is the same from one run to the next
    for variable in berrata2models:
        for severity in berrata2models[variable]:
            by_description = berrata2models[variable][severity]
            for description in by_description:
                by_description[description] = sorted(by_description[description])

    print("%d distinct cases scrutinized" % len(already_done))
    if do_print:
        print_errata2models(berrata2models, print_expids)
    #
    berrata2models["Errata service query date"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    berrata2models["Errata service query url"] = errata_base_url
    #
    return berrata2models
In [ ]:
def print_errata2models(berrata2models,print_expids=True,severities=("high","medium")):
    """
    Print the content of a dict of erratas organized as
    berrata2models[variable][severity][description] = list of model.experiment

    Arg print_expids : if True, each description is followed by the list of
       model.experiment, otherwise by the length of that list
    Arg severities : only the severities included in this collection are printed

    Top-level entries are skipped when their key includes "Errata" (e.g. the
    query date and url entries) or when their value is not a dict
    """
    for variable in berrata2models:
        by_severity = berrata2models[variable]
        if "Errata" in variable or not isinstance(by_severity, dict):
            continue
        # Single-argument print calls : same output with Python 2 and Python 3
        print("\nvariable %s" % (variable,))
        for severity in by_severity:
            if severity not in severities:
                continue
            print("\n\tseverity %s" % (severity,))
            for description in by_severity[severity]:
                expids = by_severity[severity][description]
                shown = expids if print_expids else len(expids)
                print("\n\t\t %s %s" % (description, shown))

A small scale example of using analyze_erratas

In [ ]:
# Read the data versions dictionnary designated by the notebook parameters
data_versions = read_versions_dictionnary(
    data_versions_tag,
    data_versions_dir
)
In [ ]:
# Small scale run : experiment "historical" and variable "pr" only, without
# printing (add max_count=1 for an even smaller test)
a = analyze_erratas(
    data_versions,
    experiments=["historical"],
    variables=["pr"],
    do_print=False
)
In [ ]:
# Show the erratas of the small scale run, for medium and high severities
print_errata2models(
    a,
    print_expids=True,
    severities=["medium", "high"]
)

Let us run the analysis for the whole dict

In [ ]:
# One analysis per variable (over all experiments), each one being printed
# as soon as it is available; results are kept in allvars[variable]
allvars = {}
for variable in (u'pr', u'tas', u'mrro', u'evspsbl', u'mrso', u'P-E', u'prw', u'mrsos', u'sos'):
    allvars[variable] = analyze_erratas(
        data_versions,
        do_print=False,
        variables=[variable]
    )
    print_errata2models(
        allvars[variable],
        print_expids=True,
        severities=["medium", "high"]
    )

Let us save the result

In [ ]:
jsfile="all_erratas_%s.json"%data_versions_tag
# Save the analysis of the whole dict (allvars), not the small scale example
# (a). Each allvars[variable] is organized as [variable][severity][description],
# so merging them yields a single dict with that same organization (the query
# date and url entries of the last merged analysis are retained)
all_erratas=dict()
for variable in allvars :
    all_erratas.update(allvars[variable])
all_erratas["doc"]="list_of_model.experiment[variable][severity][description]"
with open(jsfile,"w") as f :
    json.dump(all_erratas,f,separators=(',', ': '),indent=3,ensure_ascii=True)