Iterate over scenarios without any initialisation

The intention is to replace the current config, sweep, and context modules with a simpler workflow that divides this process into several discrete stages. This first stage supports iterating over combinations of observation model parameters, with each "instance" being returned as a type that can be pickled and sent to another Python process for construction and validation (i.e., to be turned into a Context object).

Iterate over scenarios without any initialisation
e3349459 · Rob Moss · d66b97b6 · e3349459 · e3349459 · e3349459
Commit e3349459 authored 3 years ago by Rob Moss
--- a/doc/api/index.rst
+++ b/doc/api/index.rst
@@ -32,6 +32,7 @@ table), while others are likely of no use outside of pypfilt_ (see the
   :mod:`pypfilt`           Provides model-fitting and forecasting functions
   :mod:`pypfilt.config`    Reads forecast scenarios from TOML_ files
   :mod:`pypfilt.sweep`     Iterates over forecast scenarios
+   :mod:`pypfilt.scenario`  Reads forecast scenarios from TOML_ files
   :mod:`pypfilt.model`     Defines the simulation model base class
                            :class:`~pypfilt.model.Model`
   :mod:`pypfilt.obs`       Defines the observation model base class
@@ -75,6 +76,7 @@ table), while others are likely of no use outside of pypfilt_ (see the
   pypfilt
   config
   sweep
+   scenario
   model
   obs
   params

--- a/doc/api/scenario.rst
+++ b/doc/api/scenario.rst
+pypfilt.scenario
+================
+
+.. py:module:: pypfilt.scenario
+
+The :mod:`pypfilt.scenario` module reads simulation scenarios from plain-text TOML_ inputs.
+
+The purpose of this module is to allow users to define and run simulations **without writing any Python code**, and instead define all of the necessary settings in TOML_ files.
+
+.. note:: A scenario will have a separate :py:class:`Instance` for each combination of observation model parameter values.
+
+Loading scenarios
+-----------------
+
+.. autofunction:: load_instances
+
+.. autoclass:: Instance
+
+Internal types
+--------------
+
+.. autoclass:: Specification
+
+.. autoclass:: Scenario
+
+.. autoclass:: ObsModelParams
+
+Internal functions
+------------------
+
+.. autofunction:: load_toml
+
+.. autofunction:: load_specifications
+
+.. autofunction:: scenarios
+
+.. autofunction:: instances
+
+.. autofunction:: observation_model_parameter_combinations
+
+.. autofunction:: scenario_observation_model_combinations
+
+.. autofunction:: scenario_observation_model_parameters
+
+.. autofunction:: override_dict
+
+.. autofunction:: as_list
--- a/src/pypfilt/scenario.py
+++ b/src/pypfilt/scenario.py
+"""
+Provides a declarative means of defining estimation and forecasting scenarios.
+
+The purpose of this module is to allow users to define and run simulations
+**without writing any Python code**, and instead define all of the necessary
+settings in `TOML`_ files.
+"""
+
+import copy
+import itertools
+import tomli
+from typing import Any, Dict, NamedTuple, Optional
+
+
+class Specification(NamedTuple):
+    """
+    A specification that defines any number of scenarios.
+
+    :param global_settings: Default settings for all scenarios.
+    :type global_settings: Dict[str, Any]
+    :param scenario_settings: Settings specific to single scenarios.
+        This is a dictionary that maps each scenario ID to the settings that
+        are specific to the identified scenario.
+    :type scenario_settings: Dict[str, Any]
+    :param source: The (optional) TOML input for this specification.
+    :type source: Optional[str]
+    """
+    # Settings that apply to every scenario unless a scenario overrides them.
+    global_settings: Dict[str, Any]
+    # Scenario-specific settings, keyed by scenario ID.
+    scenario_settings: Dict[str, Any]
+    # The TOML text from which this specification was parsed, if any.
+    source: Optional[str]
+
+
+class Scenario(NamedTuple):
+    """
+    The definition of a single scenario.
+
+    :param scenario_id: The unique identifier for this scenario.
+    :type scenario_id: str
+    :param settings: The settings dictionary, which defines all
+        of the simulation components and parameters.
+    :type settings: Dict[str, Any]
+    :param source: The (optional) TOML input for the specification that
+        defines this scenario.
+    :type source: Optional[str]
+    """
+    scenario_id: str
+    # The complete settings for this scenario (global and scenario-specific).
+    settings: Dict[str, Any]
+    # The TOML text of the specification that defines this scenario, if any.
+    source: Optional[str]
+
+
+class Instance(NamedTuple):
+    """
+    One concrete instance of a scenario.
+
+    :param scenario_id: The identifier of the scenario to which this instance
+        belongs.
+    :type scenario_id: str
+    :param settings: The settings dictionary for this instance, which defines
+        all of the simulation components and parameters, including those
+        that are specific to this instance.
+    :type settings: Dict[str, Any]
+    :param descriptor: A string that describes the observation model
+        parameter values for this specific instance.
+    :type descriptor: str
+    :param source: The (optional) TOML input for the specification that
+        defines this instance.
+    :type source: Optional[str]
+    """
+    scenario_id: str
+    settings: Dict[str, Any]
+    descriptor: str
+    source: Optional[str]
+
+    def __str__(self):
+        """
+        Return a readable summary that identifies this instance by its
+        scenario ID and descriptor; the settings and source are omitted.
+        """
+        return 'Instance(scenario_id="{}", descriptor="{}")'.format(
+            self.scenario_id, self.descriptor)
+
+    def __repr__(self):
+        """
+        Return the same text as ``__str__``.
+
+        ``__repr__`` should be *unambiguous* and ``__str__`` should be
+        *readable*; the scenario ID and the instance descriptor together
+        identify a specific instance of a specific scenario, so one
+        representation serves both purposes.
+        """
+        return self.__str__()
+
+
+class ObsModelParams(NamedTuple):
+    """
+    Describes the parameter values for an observation model, and how to format
+    the parameter names and values into an instance descriptor.
+
+    The ``values``, ``value_format``, and ``display_names`` dictionaries are
+    all keyed by parameter name.
+
+    :param unit: The observation unit, which is a unique identifier for this
+        observation model and the observations to which it pertains.
+    :type unit: str
+    :param values: The parameter values for this observation model.
+        Each parameter may have a single value or a list of values.
+    :type values: Dict[str, Any]
+    :param value_format: The format strings used to convert parameter values
+        into strings.
+    :type value_format: Dict[str, str]
+    :param display_names: The strings used to represent each parameter in
+        instance descriptors.
+    :type display_names: Dict[str, str]
+    """
+    unit: str
+    # Maps each parameter name to a single value or to a list of values.
+    values: Dict[str, Any]
+    # Maps each parameter name to a format specification, such as "03.0f".
+    value_format: Dict[str, str]
+    # Maps each parameter name to the name used in instance descriptors.
+    display_names: Dict[str, str]
+
+
+def load_instances(sources):
+    """
+    Iterate over every scenario instance defined in one or more `TOML`_
+    sources.
+
+    Each source yields a specification, each specification yields its
+    scenarios, and each scenario yields its instances.
+
+    :param sources: A list of file-like objects and/or file paths.
+        If ``sources`` is not a list, it will be treated as the only item of a
+        list.
+
+    :rtype: Iterator[Instance]
+    """
+    for specification in load_specifications(sources):
+        for scenario in scenarios(specification):
+            # NOTE: this is where the job of this module ends,
+            # and the job of Context begins.
+            yield from instances(scenario)
+
+
+def load_toml(source):
+    """
+    Read `TOML`_ content from ``source`` and return the parsed dictionary and
+    the `TOML`_ input.
+
+    :param source: A file-like object (anything with a ``read()`` method,
+        opened in either text or binary mode) or a file path.
+        File paths are opened as UTF-8 text.
+    :return: A ``(dict, str)`` tuple that contains the parsed settings and
+        the TOML text from which they were parsed.
+    """
+    if hasattr(source, 'read'):
+        toml_string = source.read()
+    else:
+        with open(source, encoding='utf-8') as f:
+            toml_string = f.read()
+
+    # NOTE: file-like objects that were opened in binary mode return bytes,
+    # which ``tomli.loads()`` does not accept. TOML documents must be UTF-8
+    # encoded, so decode the content and always return the input as ``str``.
+    if isinstance(toml_string, (bytes, bytearray)):
+        toml_string = toml_string.decode('utf-8')
+
+    parsed_dict = tomli.loads(toml_string)
+    return (parsed_dict, toml_string)
+
+
+def load_specifications(sources):
+    """
+    Iterate over the scenario specifications in ``sources``, yielding one
+    specification for each source.
+
+    :param sources: A list of file-like objects and/or file paths.
+        If ``sources`` is not a list, it will be treated as a list containing
+        one item.
+
+    :rtype: Iterator[Specification]
+
+    :raises ValueError: if a source does not define any scenarios.
+    """
+    for source in as_list(sources):
+        (settings, toml_text) = load_toml(source)
+
+        if 'scenario' not in settings:
+            raise ValueError('No scenarios defined in {}'.format(source))
+
+        # Separate the scenario tables from the settings, so that only the
+        # global settings remain in the parsed dictionary.
+        scenario_tables = settings.pop('scenario')
+
+        yield Specification(
+            global_settings=settings,
+            scenario_settings=scenario_tables,
+            source=toml_text,
+        )
+
+
+def scenarios(spec):
+    """
+    Iterate over the scenarios that are defined by the specification
+    ``spec``.
+
+    :param spec: The scenario specifications.
+    :type spec: Specification
+
+    :rtype: Iterator[Scenario]
+    """
+    for (scenario_id, overrides) in spec.scenario_settings.items():
+        # Apply the scenario-specific settings on top of the global settings.
+        # Both dictionaries are copied first, so that the specification is
+        # left untouched and no nested values are shared between scenarios.
+        settings = override_dict(
+            copy.deepcopy(spec.global_settings),
+            copy.deepcopy(overrides),
+        )
+        yield Scenario(
+            scenario_id=scenario_id,
+            settings=settings,
+            source=spec.source,
+        )
+
+
+def instances(scenario):
+    """
+    Iterate over the instances of a single scenario, yielding one instance
+    for each combination of observation model parameter values.
+
+    :param scenario: The scenario definition.
+    :type scenario: Scenario
+
+    :rtype: Iterator[Instance]
+
+    :raises ValueError: if two instances have the same descriptor.
+    """
+    seen_descriptors = set()
+    combinations = scenario_observation_model_combinations(scenario)
+    for (obs_values, descriptor) in combinations:
+
+        # Descriptors distinguish the instances of a scenario from each
+        # other, so each descriptor must only occur once.
+        if descriptor in seen_descriptors:
+            msg_fmt = 'Scenario "{}" has a duplicate descriptor "{}"'
+            raise ValueError(msg_fmt.format(scenario.scenario_id, descriptor))
+        seen_descriptors.add(descriptor)
+
+        # Give each instance its own copy of the scenario settings, and
+        # replace the parameters of each observation model with the values
+        # for this combination.
+        settings = copy.deepcopy(scenario.settings)
+        for (obs_unit, parameters) in obs_values.items():
+            settings['observations'][obs_unit]['parameters'] = parameters
+
+        yield Instance(
+            scenario_id=scenario.scenario_id,
+            settings=settings,
+            descriptor=descriptor,
+            source=scenario.source,
+        )
+
+
+def observation_model_parameter_combinations(obs_params):
+    """
+    Iterate over every combination of parameter values for a single
+    observation model.
+
+    Each combination is returned as a ``(unit, values, descriptor)`` tuple,
+    where ``values`` maps each parameter name to a single value, and
+    ``descriptor`` is built from the display name and formatted value of each
+    parameter, joined by hyphens.
+
+    :param obs_params: The observation model parameters definition.
+    :type obs_params: ObsModelParams
+
+    :rtype: Iterator[tuple[str, Dict[str, float | int], str]]
+    """
+    # NOTE: sort parameters by name to ensure a consistent ordering.
+    names = sorted(obs_params.values)
+
+    # Build a single format string that covers every parameter.
+    # For example, if 'bg_obs' is the first parameter and has the display
+    # name 'bg', its field will be "bg-{values[0]:{formats[bg_obs]}}".
+    field_fmt = '{0}-{{values[{1}]:{{formats[{2}]}}}}'
+    out_fmt = '-'.join(
+        field_fmt.format(obs_params.display_names[name], ix, name)
+        for (ix, name) in enumerate(names))
+
+    # NOTE: the values are scanned in the same order as ``names``, so that
+    # each value lines up with its index in the format string.
+    scan = [as_list(obs_params.values[name]) for name in names]
+    for combination in itertools.product(*scan):
+        descriptor = out_fmt.format(values=combination,
+                                    formats=obs_params.value_format)
+        yield (obs_params.unit, dict(zip(names, combination)), descriptor)
+
+
+def as_list(values):
+    """
+    Return ``values`` as a list.
+
+    A list is returned as-is (it is not copied); any other value, including
+    tuples and strings, is wrapped in a new single-item list.
+
+    :param values: A list of values, or a value that will be returned as the
+        only item of the returned list.
+    :type values: Union[list[Any], Any]
+
+    :rtype: list[Any]
+    """
+    return values if isinstance(values, list) else [values]
+
+
+def scenario_observation_model_combinations(scenario):
+    """
+    Iterate over every combination of parameter values across all of the
+    observation models in a scenario.
+
+    Each combination is returned as a ``(values, descriptor)`` tuple, where
+    ``values`` is a dictionary that maps each observation model (identified by
+    observation unit) to the parameter values for that observation model, and
+    ``descriptor`` joins the descriptor of each observation model with
+    hyphens.
+
+    :param scenario: The scenario definition.
+    :type scenario: Scenario
+
+    :rtype: Iterator[tuple[Dict[str, Any], str]]
+    """
+    # NOTE: a scenario without observation models has a single combination,
+    # which has no parameter values and an empty descriptor.
+    if 'observations' not in scenario.settings:
+        yield ({}, "")
+        return
+
+    per_model_combinations = [
+        observation_model_parameter_combinations(om_params)
+        for om_params in scenario_observation_model_parameters(
+            scenario).values()
+    ]
+    for combination in itertools.product(*per_model_combinations):
+        # NOTE: each element is (unit, values_dict, descriptor)
+        descriptor = '-'.join(descr for (_unit, _values, descr) in combination)
+        obs_config = {
+            unit: values
+            for (unit, values, _descr) in combination
+        }
+        yield (obs_config, descriptor)
+
+
+def scenario_observation_model_parameters(scenario):
+    """
+    Return the parameter definitions for each observation model in a
+    scenario, where each observation model is identified by its observation
+    unit.
+
+    Each observation model table must contain ``parameters``, ``format``, and
+    ``name`` tables.
+
+    :param scenario: The scenario definition.
+    :type scenario: Scenario
+
+    :rtype: Dict[str, ObsModelParams]
+
+    :raises ValueError: if the parameter names are not consistent across the
+        parameter values, the value format strings, and the parameter display
+        names.
+    """
+    obs_models = {}
+    for (unit, om_dict) in scenario.settings['observations'].items():
+        obs_models[unit] = ObsModelParams(
+            unit=unit,
+            values=om_dict['parameters'],
+            value_format=om_dict['format'],
+            display_names=om_dict['name'],
+        )
+
+    # The parameter values, format strings, and display names must all be
+    # defined for exactly the same set of parameters.
+    for om_params in obs_models.values():
+        value_keys = set(om_params.values)
+        format_keys = set(om_params.value_format)
+        names_keys = set(om_params.display_names)
+        if not (value_keys == format_keys == names_keys):
+            msg_fmt = 'Invalid "{}" observation model'
+            raise ValueError(msg_fmt.format(om_params.unit))
+
+    return obs_models
+
+
+def override_dict(defaults, overrides):
+    """
+    Apply the values in ``overrides`` on top of the values in ``defaults``,
+    recursively descending into matching nested dictionaries.
+
+    Nested defaults are only retained when both the default value and the
+    override value are dictionaries. In every other case, the default value
+    (if any) is simply replaced by the override value.
+
+    :param dict defaults: The original values; note that this dictionary
+        **will be modified**.
+    :param dict overrides: The overriding values.
+    :return: The modified ``defaults`` dictionary.
+    :rtype: Dict[Any, Any]
+    """
+    for (key, value) in overrides.items():
+        current = defaults.get(key)
+        if isinstance(value, dict) and isinstance(current, dict):
+            # Merge the override into the nested default values.
+            defaults[key] = override_dict(current, value)
+        else:
+            # There is nothing to merge; replace (or add) the value.
+            defaults[key] = value
+    return defaults
--- a/tests/test_scenario_instances.py
+++ b/tests/test_scenario_instances.py
+"""
+Test that scenario instances are generated as expected from TOML content.
+"""
+
+import io
+import pypfilt.scenario
+import pytest
+
+
+def test_scenario_empty_toml():
+    """
+    Test that loading instances from TOML input that defines no scenarios
+    raises a ValueError.
+    """
+    toml_input = """
+    """
+    with pytest.raises(ValueError):
+        # NOTE: instances are generated lazily, so the iterator must be
+        # consumed in order for the input to be read.
+        list(pypfilt.scenario.load_instances(io.StringIO(toml_input)))
+
+
+def test_scenario_single_instance():
+    """
+    Test that we obtain a single instance from this minimal TOML input, and
+    that scenario-specific settings override the global settings.
+    """
+    toml_input = """
+    global = true
+    hello = "world"
+
+    [scenario.test]
+    global = false
+    local = true
+    """
+    source = io.StringIO(toml_input)
+    instances = list(pypfilt.scenario.load_instances(source))
+    assert len(instances) == 1
+
+    # Check that global and scenario-specific parameters have been applied.
+    instance = instances[0]
+    assert instance.scenario_id == 'test'
+    assert instance.descriptor == ''
+
+    # NOTE: compare booleans by identity, so that truthy or falsey values of
+    # other types (such as 1 or 0) are not accepted in their place.
+    assert len(instance.settings) == 3
+    assert 'global' in instance.settings
+    assert instance.settings['global'] is False
+    assert 'hello' in instance.settings
+    assert instance.settings['hello'] == 'world'
+    assert 'local' in instance.settings
+    assert instance.settings['local'] is True
+
+
+def test_scenario_many_observation_models():
+    """
+    Test that we obtain multiple instances for a single scenario, one for
+    each combination of observation model parameter values.
+    """
+    toml_input = """
+
+    [scenario.test]
+
+    [scenario.test.observations.x]
+    parameters.bg_obs = 1
+    parameters.pr_obs = [0.1, 0.2, 0.5]
+    parameters.disp = 10
+    format = { bg_obs = "03.0f", pr_obs = "0.1f", disp = "03.0f" }
+    name = { bg_obs = "bg", pr_obs = "pr", disp = "disp" }
+
+    [scenario.test.observations.y]
+    parameters.bg_obs = 2
+    parameters.pr_obs = 0.8
+    parameters.disp = [100, 1000]
+    format = { bg_obs = "03.0f", pr_obs = "0.1f", disp = "04.0f" }
+    name = { bg_obs = "bg", pr_obs = "pr", disp = "disp" }
+    """
+    source = io.StringIO(toml_input)
+    instances = list(pypfilt.scenario.load_instances(source))
+    # Three values of x.pr_obs multiplied by two values of y.disp.
+    assert len(instances) == 6
+
+    # Check that each instance has the correct scenario ID.
+    assert all(inst.scenario_id == 'test' for inst in instances)
+
+    # Check that each instance descriptor is unique.
+    descriptors = {inst.descriptor for inst in instances}
+    assert len(descriptors) == len(instances)
+
+    def count_matches(unit, name, value):
+        """
+        Return the number of instances in which the parameter ``name`` of
+        the observation model ``unit`` is equal to ``value``.
+        """
+        return sum(
+            inst.settings['observations'][unit]['parameters'][name] == value
+            for inst in instances)
+
+    # Check that we have the expected number of instances for each value of
+    # the x and y observation model parameters.
+    expected_counts = [
+        # The 'bg_obs' values are constant for each observation model.
+        ('x', 'bg_obs', 1, len(instances)),
+        ('y', 'bg_obs', 2, len(instances)),
+        # The 'disp' values only vary for the 'y' observation model.
+        ('x', 'disp', 10, len(instances)),
+        ('y', 'disp', 100, 3),
+        ('y', 'disp', 1000, 3),
+        # The 'pr_obs' values only vary for the 'x' observation model.
+        ('y', 'pr_obs', 0.8, 6),
+        ('x', 'pr_obs', 0.1, 2),
+        ('x', 'pr_obs', 0.2, 2),
+        ('x', 'pr_obs', 0.5, 2),
+    ]
+    for (unit, name, value, expected) in expected_counts:
+        assert count_matches(unit, name, value) == expected
+
+
+def test_scenario_multiple_scenarios():
+    """
+    Test that we obtain a single instance for each scenario, and that
+    scenario-specific settings are merged into nested global settings.
+    """
+    toml_input = """
+    global = { foo = "hello", extra = "hi" }
+    default = 1
+    hello = "world"
+
+    [scenario.foo]
+    global = { foo = "goodbye" }
+    default = { a = "yes", b = "no" }
+    local = true
+
+    [scenario.bar]
+    global = { bar = "world" }
+    local = true
+    """
+    source = io.StringIO(toml_input)
+    instances = list(pypfilt.scenario.load_instances(source))
+    assert len(instances) == 2
+
+    foos = list(filter(lambda i: i.scenario_id == 'foo', instances))
+    bars = list(filter(lambda i: i.scenario_id == 'bar', instances))
+    assert len(foos) == 1
+    assert len(bars) == 1
+    foo = foos[0]
+    bar = bars[0]
+
+    # Check that global and scenario-specific parameters have been applied.
+    assert foo.descriptor == ''
+    assert bar.descriptor == ''
+
+    assert len(foo.settings) == 4
+    assert len(bar.settings) == 4
+    assert foo.settings['hello'] == 'world'
+    assert bar.settings['hello'] == 'world'
+    # A dictionary replaces a non-dictionary default value outright.
+    assert foo.settings['default'] == {'a': 'yes', 'b': 'no'}
+    assert bar.settings['default'] == 1
+    # NOTE: compare booleans by identity, so that truthy values of other
+    # types (such as 1) are not accepted in their place.
+    assert foo.settings['local'] is True
+    assert bar.settings['local'] is True
+    # Nested dictionaries are merged, retaining the defaults that were not
+    # overridden.
+    assert foo.settings['global'] == {'foo': 'goodbye',
+                                      'extra': 'hi'}
+    assert bar.settings['global'] == {'foo': 'hello',
+                                      'bar': 'world',
+                                      'extra': 'hi'}
+
+
+def test_scenario_multiple_sources():
+    """
+    Test that we obtain instances from each source when a list of sources is
+    provided.
+    """
+    toml_input_a = """
+    [scenario.foo]
+    local = true
+    """
+
+    toml_input_b = """
+    [scenario.bar]
+    local = true
+    """
+    source_a = io.StringIO(toml_input_a)
+    source_b = io.StringIO(toml_input_b)
+    sources = [source_a, source_b]
+    instances = list(pypfilt.scenario.load_instances(sources))
+    assert len(instances) == 2
+
+    foos = list(filter(lambda i: i.scenario_id == 'foo', instances))
+    bars = list(filter(lambda i: i.scenario_id == 'bar', instances))
+    assert len(foos) == 1
+    assert len(bars) == 1
+    foo = foos[0]
+    bar = bars[0]
+
+    # Check that global and scenario-specific parameters have been applied.
+    assert foo.descriptor == ''
+    assert bar.descriptor == ''
+
+    # NOTE: compare booleans by identity, so that truthy values of other
+    # types (such as 1) are not accepted in their place.
+    assert len(foo.settings) == 1
+    assert len(bar.settings) == 1
+    assert foo.settings['local'] is True
+    assert bar.settings['local'] is True
+
+
+def test_scenario_many_scenarios_instances():
+    """
+    Test that we obtain multiple instances for each scenario, when one
+    observation model is defined globally and another is defined for a
+    single scenario.
+    """
+    toml_input = """
+
+    [observations.x]
+    parameters.bg_obs = 1
+    parameters.pr_obs = [0.1, 0.2, 0.5]
+    parameters.disp = 10
+    format = { bg_obs = "03.0f", pr_obs = "0.1f", disp = "03.0f" }
+    name = { bg_obs = "bg", pr_obs = "pr", disp = "disp" }
+
+    [scenario.single]
+
+    [scenario.multi]
+
+    [scenario.multi.observations.y]
+    parameters.bg_obs = 2
+    parameters.pr_obs = 0.8
+    parameters.disp = [100, 1000]
+    format = { bg_obs = "03.0f", pr_obs = "0.1f", disp = "04.0f" }
+    name = { bg_obs = "bg", pr_obs = "pr", disp = "disp" }
+
+    """
+    source = io.StringIO(toml_input)
+    instances = list(pypfilt.scenario.load_instances(source))
+    assert len(instances) == 9
+
+    singles = [inst for inst in instances if inst.scenario_id == 'single']
+    multis = [inst for inst in instances if inst.scenario_id == 'multi']
+    assert len(singles) == 3
+    assert len(multis) == 6
+
+    # The 'single' scenario only has the global observation model, while the
+    # 'multi' scenario also has its own observation model.
+    for inst in singles:
+        assert set(inst.settings['observations']) == {'x'}
+
+    for inst in multis:
+        assert set(inst.settings['observations']) == {'x', 'y'}
+
+    def count_matches(insts, unit, name, value):
+        """
+        Return the number of instances in ``insts`` in which the parameter
+        ``name`` of the observation model ``unit`` is equal to ``value``.
+        """
+        return sum(
+            inst.settings['observations'][unit]['parameters'][name] == value
+            for inst in insts)
+
+    # Check the 'pr_obs' values for 'x' vary as expected.
+    for x_pr in [0.1, 0.2, 0.5]:
+        assert count_matches(singles, 'x', 'pr_obs', x_pr) == 1
+        assert count_matches(multis, 'x', 'pr_obs', x_pr) == 2
+
+    # Check the 'disp' values for 'y' vary as expected.
+    for y_disp in [100, 1000]:
+        assert count_matches(multis, 'y', 'disp', y_disp) == 3