Dataset generation

Scripts to generate the dataset for the project and to merge the generated files into a single array file.

::: {#cell-3 .cell 0=‘e’ 1=‘x’ 2=‘p’ 3=‘o’ 4=‘r’ 5=‘t’}

from pathlib import Path
from omegaconf import DictConfig, OmegaConf, open_dict
import hydra
import numpy as np
from tqdm import tqdm
from typing import Optional, List, Union
from physmodjax.solver.generator import (
    generate_initial_condition,
    Generator,
    Gaussian,
    SineMode,
)
import os
import logging

:::

::: {#cell-4 .cell 0=‘e’ 1=‘x’ 2=‘p’ 3=‘o’ 4=‘r’ 5=‘t’}

def generate_run(
    rng: np.random.Generator,
    solver,
    generator: Generator = Gaussian(),
    **ic_params,
):
    """Simulate one run starting from a freshly drawn initial condition.

    The initial state is produced by `generate_initial_condition(rng, generator,
    **ic_params)` and handed to `solver.solve` as the keyword arguments `u0`
    and `v0`. Whatever `solver.solve` returns is returned unchanged.
    """
    initial_state = generate_initial_condition(rng, generator, **ic_params)
    # the initial condition comes back as a (u0, v0) pair
    first, second = initial_state
    return solver.solve(u0=first, v0=second)


@hydra.main(version_base=None, config_path="../../conf", config_name="generate_data")
def generate_dataset(
    cfg: DictConfig,
):
    """Generate `cfg.number_ics` simulation runs and save each as `ic_XXXXX.npy`.

    The solver and the initial-condition generator are instantiated from
    `cfg.solver` and `cfg.ic_generator`. For every run the `u` and `v` arrays
    returned by the solver are stacked on a new last axis and written with
    `np.save` to a path relative to the current working directory.

    When the generator is a `SineMode`, run `i` uses `ic_sine_k = i` and the
    number of runs is capped at `solver.n_max_modes`.

    In Hydra multirun mode the progress bar is not drawn on the terminal; its
    textual form is sent to the "tqdm_logger" logger instead.
    """
    # Local import so this block does not depend on a file-level import.
    from contextlib import ExitStack

    print(OmegaConf.to_yaml(cfg, resolve=True))

    # Work on a plain, mutable dict. A DictConfig taken from a Hydra config is
    # in struct mode and rejects new keys, while the `{}` fallback is not a
    # config node and cannot be passed to `open_dict`.
    raw_ic_params = getattr(cfg, "ic_params", None)
    if raw_ic_params is None:
        ic_params = {}
    elif OmegaConf.is_config(raw_ic_params):
        ic_params = OmegaConf.to_container(raw_ic_params, resolve=True)
    else:
        ic_params = dict(raw_ic_params)

    solver = hydra.utils.instantiate(cfg.solver)
    generator = hydra.utils.instantiate(cfg.ic_generator)
    uses_sine_modes = isinstance(generator, SineMode)

    # hydra multirun flag
    hydra_multirun = (
        hydra.core.hydra_config.HydraConfig.get().mode == hydra.types.RunMode.MULTIRUN
    )

    number_ics = cfg.number_ics
    # With SineMode every run uses a different mode index k, so there is no
    # point in asking for more runs than the solver has modes.
    if uses_sine_modes:
        number_ics = min(number_ics, solver.n_max_modes)
        if number_ics < cfg.number_ics:
            print(
                f"Warning: number_ics was changed from {cfg.number_ics} to {number_ics} to avoid modes not used in the solver when using SineMode."
            )

    rng = np.random.default_rng(cfg.seed)

    # To preserve backwards compatibility, we need to check if the config has ic_type or ic_max_amplitude at the base level
    # and set ic_params accordingly. Before, ic_max_amplitude being a float was assumed to indicate randomized amplitude.
    if hasattr(cfg, "ic_type") and ("ic_type" not in ic_params):
        ic_params["ic_type"] = cfg.ic_type
    if hasattr(cfg, "ic_max_amplitude"):
        ic_params["ic_max_amplitude"] = cfg.ic_max_amplitude
        ic_params["ic_amplitude_random"] = True

    # The ExitStack closes the progress bar first and the devnull sink (if
    # any) afterwards, also when a run raises.
    with ExitStack() as stack:
        logger = None
        if hydra_multirun:
            # In multirun mode the bar is rendered into devnull and its
            # string form is forwarded to the logger on every iteration.
            logger = logging.getLogger("tqdm_logger")
            logger.setLevel(logging.INFO)
            logger.info(OmegaConf.to_yaml(cfg))
            sink = stack.enter_context(open(os.devnull, "w"))
            progress_bar = tqdm(range(1, number_ics + 1), file=sink)
        else:
            progress_bar = tqdm(range(1, number_ics + 1))
        stack.enter_context(progress_bar)

        # create initial conditions
        for i in progress_bar:
            if uses_sine_modes:
                ic_params["ic_sine_k"] = int(i)
            # the first element returned by the solver is not stored
            _, u, v = generate_run(rng, solver, generator, **ic_params)

            ic = np.stack([u, v], axis=-1)

            # save
            file_name = f"ic_{i:05d}.npy"
            progress_bar.set_postfix({"Saved file": f"{file_name}"})

            if logger is not None:
                logger.info(str(progress_bar))

            # The convention for the data is:
            # (timesteps, grid_points, statevars)
            np.save(Path(file_name), ic)

:::

Test

from hydra import initialize, compose

# Smoke test: compose the "generate_data" config with a tiny 2D setup and run
# the generator once, outside of a real Hydra launch.
_smoke_test_overrides = [
    "+dataset=test2d",
    "solver=tension_modulated_2d",
    "solver.use_nonlinear=True",
    "ic_generator=noise2d",
    "number_ics=3",
    "solver.final_time=0.001",
    "+ic_params.ic_max_amplitude=0.01",
]

with initialize(version_base=None, config_path="../../conf"):
    cfg = compose(
        config_name="generate_data",
        return_hydra_config=True,
        overrides=_smoke_test_overrides,
    )
    # generate_dataset reads HydraConfig.get(), so the composed config has to
    # be registered by hand before calling it directly.
    hydra_config = hydra.core.hydra_config.HydraConfig.instance()
    hydra_config.set_config(cfg)
    generate_dataset(cfg)

hydra:
  run:
    dir: ${output_dir}
  sweep:
    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
    subdir: ${hydra.job.num}
  launcher:
    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
  sweeper:
    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
    max_batch_size: null
    params: null
  help:
    app_name: ${hydra.job.name}
    header: '${hydra.help.app_name} is powered by Hydra.

      '
    footer: 'Powered by Hydra (https://hydra.cc)

      Use --hydra-help to view Hydra specific help

      '
    template: '${hydra.help.header}

      == Configuration groups ==

      Compose your configuration from those groups (group=option)


      $APP_CONFIG_GROUPS


      == Config ==

      Override anything in the config (foo.bar=value)


      $CONFIG


      ${hydra.help.footer}

      '
  hydra_help:
    template: 'Hydra (${hydra.runtime.version})

      See https://hydra.cc for more info.


      == Flags ==

      $FLAGS_HELP


      == Configuration groups ==

      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
      to command line)


      $HYDRA_CONFIG_GROUPS


      Use ''--cfg hydra'' to Show the Hydra config.

      '
    hydra_help: ???
  hydra_logging:
    version: 1
    formatters:
      simple:
        format: '[%(asctime)s][HYDRA] %(message)s'
    handlers:
      console:
        class: logging.StreamHandler
        formatter: simple
        stream: ext://sys.stdout
    root:
      level: INFO
      handlers:
      - console
    loggers:
      logging_example:
        level: DEBUG
    disable_existing_loggers: false
  job_logging:
    version: 1
    formatters:
      simple:
        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
    handlers:
      console:
        class: logging.StreamHandler
        formatter: simple
        stream: ext://sys.stdout
      file:
        class: logging.FileHandler
        formatter: simple
        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
    root:
      level: INFO
      handlers:
      - console
      - file
    disable_existing_loggers: false
  env: {}
  mode: null
  searchpath: []
  callbacks: {}
  output_subdir: .hydra
  overrides:
    hydra: []
    task:
    - +dataset=test2d
    - solver=tension_modulated_2d
    - solver.use_nonlinear=True
    - ic_generator=noise2d
    - number_ics=3
    - solver.final_time=0.001
    - +ic_params.ic_max_amplitude=0.01
  job:
    name: notebook
    chdir: true
    override_dirname: +dataset=test2d,+ic_params.ic_max_amplitude=0.01,ic_generator=noise2d,number_ics=3,solver.final_time=0.001,solver.use_nonlinear=True,solver=tension_modulated_2d
    id: ???
    num: ???
    config_name: generate_data
    env_set: {}
    env_copy: []
    config:
      override_dirname:
        kv_sep: '='
        item_sep: ','
        exclude_keys: []
  runtime:
    version: 1.3.2
    version_base: '1.3'
    cwd: /home/carlos/projects/physmodjax/nbs/scripts
    config_sources:
    - path: hydra.conf
      schema: pkg
      provider: hydra
    - path: /home/carlos/projects/physmodjax/conf
      schema: file
      provider: main
    - path: ''
      schema: structured
      provider: schema
    output_dir: ???
    choices:
      dataset: test2d
      ic_generator: noise2d
      solver: tension_modulated_2d
      hydra/env: default
      hydra/callbacks: null
      hydra/job_logging: default
      hydra/hydra_logging: default
      hydra/hydra_help: default
      hydra/help: default
      hydra/sweeper: basic
      hydra/launcher: basic
      hydra/output: default
  verbose: false
solver:
  _target_: physmodjax.solver.wave2d_tenmod.Wave2dSolverTensionModulated
  sampling_rate: 16000
  final_time: 0.001
  n_gridpoints_x: 41
  length_x: 0.4
  aspect_ratio: 0.9
  rho: 1380
  h: 0.00019
  E: 3500000000.0
  nu: 0.3
  d1: 8.0e-05
  d3: 1.4e-05
  Ts0: 2620
  n_max_modes: 225
  use_nonlinear: true
ic_generator:
  _target_: physmodjax.solver.generator.Noise2d
  num_points_x: ${solver.n_gridpoints_x}
  aspect_ratio: ${solver.aspect_ratio}
name: test2d
output_dir: data/${name}
number_ics: 3
seed: 3407
ic_params:
  ic_max_amplitude: 0.01

  0%|          | 0/3 [00:00<?, ?it/s] 33%|███▎      | 1/3 [00:00<00:01,  1.07it/s, Saved file=ic_00001.npy]

bar_u shape: (225, 16)
bar_v shape: (225, 16)

 67%|██████▋   | 2/3 [00:01<00:00,  1.09it/s, Saved file=ic_00002.npy]

bar_u shape: (225, 16)
bar_v shape: (225, 16)

100%|██████████| 3/3 [00:02<00:00,  1.06it/s, Saved file=ic_00003.npy]

bar_u shape: (225, 16)
bar_v shape: (225, 16)

Convert to single file

::: {#cell-8 .cell 0=‘e’ 1=‘x’ 2=‘p’ 3=‘o’ 4=‘r’ 5=‘t’}

from fastcore.script import call_parse

:::

::: {#cell-9 .cell 0=‘e’ 1=‘x’ 2=‘p’ 3=‘o’ 4=‘r’ 5=‘t’}

@call_parse
def convert_to_single_file(
    data_dir: str,  # the directory where the files are
    output_file: str,  # the output file
    target_dtype: str = "np.float32",  # the dtype of the output file
):
    """Stack every `.npy` file found in `data_dir` into one `.npy` file.

    The input files are ordered by file stem and written, one per index along
    a new leading axis, into a memory-mapped output array whose remaining
    dimensions are taken from the first file. Each array is cast to
    `target_dtype`, which may be given as a NumPy dtype name with or without
    the "np." / "numpy." prefix (for example "np.float32" or "float32").

    A FileNotFoundError is raised when `data_dir` contains no `.npy` input.
    """
    # Skip the output file itself in case it lives inside `data_dir`: it is
    # truncated when the memmap is created and must not be read as an input.
    output_path = Path(output_file).resolve()
    files = sorted(
        (f for f in Path(data_dir).glob("*.npy") if f.resolve() != output_path),
        key=lambda f: f.stem,
    )
    if not files:
        raise FileNotFoundError(f"No .npy files found in {data_dir!r}")

    # Parse the dtype by name instead of calling eval() on a command-line string.
    dtype_name = target_dtype.strip()
    for prefix in ("numpy.", "np."):
        if dtype_name.startswith(prefix):
            dtype_name = dtype_name[len(prefix):]
            break
    dtype = np.dtype(dtype_name)

    # load the first file to get the shape
    first_file = np.load(files[0])
    shape = (len(files), *first_file.shape)

    # create a memory-mapped file
    target = np.lib.format.open_memmap(
        output_file, mode="w+", shape=shape, dtype=dtype
    )

    for i, f in enumerate(files):
        target[i] = np.load(f).astype(dtype)

    # make sure everything is written to disk before reporting success
    target.flush()

    print(
        f"Saved to {output_file}, has a size of {target.nbytes / 1e9} GB, and shape {target.shape}"
    )

:::