# Source code for aneris._io

"""Provides helper functions for reading input data and configuration files.

The default configuration values are provided in aneris.RC_DEFAULTS.
"""
import collections
import collections.abc
import os

import pandas as pd
import yaml

from aneris.utils import isstr, isnum, iamc_idx

# Default run-control configuration; parsed as YAML by RunControl and
# recursively overlaid with any user-provided configuration.
RC_DEFAULTS = """
config:
    default_luc_method: reduce_ratio_2150_cov
    cov_threshold: 20
    harmonize_year: 2015
prefix: CEDS+|9+ Sectors
suffix: Unharmonized
add_5regions: true
"""


def _read_data(indfs):
    """Combine all 'data*' sheets of an input workbook into one DataFrame.

    Parameters
    ----------
    indfs : dict of pd.DataFrame
        mapping of sheet name to sheet contents; every sheet whose name
        starts with ``data`` is concatenated, in sorted-name order

    Returns
    -------
    df : pd.DataFrame
        combined data with string column labels, float-typed numerical
        (year) columns, and capitalized column names
    """
    sheet_names = sorted(name for name in indfs if name.startswith('data'))
    df = pd.concat(indfs[name] for name in sheet_names)

    # reading from excel can change dtypes and column types,
    # so reset them manually here
    df.columns = df.columns.astype(str)
    year_cols = [col for col in df.columns if isnum(col)]
    df[year_cols] = df[year_cols].astype(float)

    # normalize column-name style: some teams don't provide
    # standardized capitalization
    df.columns = df.columns.str.capitalize()

    return df


def _recursive_update(d, u):
    for k, v in u.items():
        if isinstance(v, collections.Mapping):
            r = _recursive_update(d.get(k, {}), v)
            d[k] = r
        else:
            d[k] = u[k]
    return d


def pd_read(f, str_cols=False, *args, **kwargs):
    """Try to read a file with pandas, supports CSV and XLSX

    Parameters
    ----------
    f : string
        the file to read in
    str_cols : bool, optional
        turn all columns into strings (numerical column names are
        sometimes read in as numerical dtypes)
    args, kwargs : sent directly to the Pandas read function

    Returns
    -------
    df : pd.DataFrame
    """
    # dispatch on the file extension: anything not ending in 'csv'
    # is handed to the excel reader
    reader = pd.read_csv if f.endswith('csv') else pd.read_excel
    df = reader(f, *args, **kwargs)
    if str_cols:
        df.columns = [str(col) for col in df.columns]
    return df
def pd_write(df, f, *args, **kwargs):
    """Try to write a file with pandas, supports CSV and XLSX

    Parameters
    ----------
    df : pd.DataFrame
        the data to write
    f : string
        the file to write to; a 'csv' suffix selects CSV output,
        anything else is written as excel
    args, kwargs : sent directly to the Pandas write function
    """
    # guess whether to use index, unless we're told otherwise
    index = kwargs.pop('index', isinstance(df.index, pd.MultiIndex))

    if f.endswith('csv'):
        df.to_csv(f, index=index, *args, **kwargs)
    else:
        # ExcelWriter.save() was deprecated in pandas 1.5 and removed in
        # 2.0; the context manager saves and closes the file on exit
        with pd.ExcelWriter(f, engine='xlsxwriter') as writer:
            df.to_excel(writer, index=index, *args, **kwargs)
def read_excel(f):
    """Read an excel-based input file for harmonization.

    Parameters
    ----------
    f : string
        path to input file

    Returns
    -------
    model : pd.DataFrame
        model data frame in IAMC format
    overrides : pd.DataFrame
        overrides data frame in IAMC format
    config : dictionary
        configuration overrides (if any)
    """
    # sheet_name=None reads all sheets into a dict keyed by sheet name;
    # the keyword was renamed from `sheetname` in pandas 0.21, and
    # `encoding` is not a valid read_excel argument in modern pandas
    indfs = pd_read(f, sheet_name=None)
    model = _read_data(indfs)

    # make an empty df which will be caught later
    overrides = indfs['harmonization'] if 'harmonization' in indfs \
        else pd.DataFrame([], columns=iamc_idx + ['Unit'])

    # get run control: a 'Configuration'/'Value' column pair on the
    # harmonization sheet carries per-file config overrides
    config = {}
    if 'Configuration' in overrides:
        config = overrides[['Configuration', 'Value']].dropna()
        config = config.set_index('Configuration').to_dict()['Value']
        overrides = overrides.drop(['Configuration', 'Value'], axis=1)

    # a single row of nans implies only configs provided,
    # if so, only return the empty df
    if len(overrides) == 1 and overrides.isnull().values.all():
        overrides = pd.DataFrame([], columns=iamc_idx + ['Unit'])

    return model, overrides, config
class RunControl(collections.abc.Mapping):
    """A thin wrapper around a Python Dictionary to support configuration of
    harmonization execution. Input can be provided as dictionaries or YAML
    files.
    """

    def __init__(self, rc=None, defaults=None):
        """
        Parameters
        ----------
        rc : string, file, dictionary, optional
            a path to a YAML file, a file handle for a YAML file, or a
            dictionary describing run control configuration
        defaults : string, file, dictionary, optional
            a path to a YAML file, a file handle for a YAML file, or a
            dictionary describing **default** run control configuration
        """
        rc = rc or {}
        defaults = defaults or RC_DEFAULTS
        rc = self._load_yaml(rc)
        defaults = self._load_yaml(defaults)
        # user-provided values override the defaults, key by key
        self.store = _recursive_update(defaults, rc)

    def __getitem__(self, k):
        return self.store[k]

    def __iter__(self):
        return iter(self.store)

    def __len__(self):
        return len(self.store)

    def __repr__(self):
        return self.store.__repr__()

    def _get_path(self, key, fyaml, fname):
        """Resolve `fname` as given, or relative to the YAML file's directory.

        Raises
        ------
        IOError
            if `fname` is neither a valid absolute nor relative path
        """
        if os.path.exists(fname):
            return fname

        _fname = os.path.join(os.path.dirname(fyaml), fname)
        if not os.path.exists(_fname):
            msg = "YAML key '{}' in {}: {} is not a valid relative " + \
                "or absolute path"
            raise IOError(msg.format(key, fyaml, fname))
        return _fname

    def _fill_relative_paths(self, fyaml, d):
        # keys whose values are lists of file paths that may be given
        # relative to the YAML file's location
        file_keys = [
            'exogenous',
        ]
        for k in file_keys:
            if k in d:
                d[k] = [self._get_path(k, fyaml, fname)
                        for fname in d[k]]

    def _load_yaml(self, obj):
        """Normalize a dict, YAML string, path, or file handle to a dict."""
        check_rel_paths = False
        if hasattr(obj, 'read'):  # it's a file
            obj = obj.read()
        if isstr(obj) and os.path.exists(obj):
            # it's a path to a YAML file on disk; remember its location
            # so relative paths inside it can be resolved
            check_rel_paths = True
            fname = obj
            with open(fname) as f:
                obj = f.read()
        if not isinstance(obj, dict):
            # safe_load never constructs arbitrary Python objects; bare
            # yaml.load without a Loader has been rejected since PyYAML 6
            obj = yaml.safe_load(obj)
        if check_rel_paths:
            self._fill_relative_paths(fname, obj)
        return obj

    def recursive_update(self, k, d):
        """Recursively update a top-level option in the run control

        Parameters
        ----------
        k : string
            the top-level key
        d : dictionary or similar
            the dictionary to use for updating
        """
        u = self.__getitem__(k)
        self.store[k] = _recursive_update(u, d)