Skip to content

Aggregation and merging

MergeDatasets

Bases: GlobalStep

Step for merging datasets.

The input datasets are merged by concatenation. The merged dataset can optionally be shuffled by providing a seed.

Source code in ragfoundry/processing/global_steps/aggregation.py
class MergeDatasets(GlobalStep):
    """
    Step for merging datasets.

    Merge is done using concatenation. Optional shuffling by providing a seed.
    """

    def __init__(self, output, shuffle=None, **kwargs):
        """
        Args:
            output (str): Name of the output dataset. Should be unique.
            shuffle (int, optional): seed for shuffling. Default is None,
                meaning no shuffling. Any integer seed, including 0, enables
                shuffling.
            **kwargs: Forwarded unchanged to the ``GlobalStep`` constructor.
        """
        super().__init__(**kwargs)
        self.output = output
        self.shuffle = shuffle
        # Guards `process` so that the merge is performed only once.
        self.completed = False
        # NOTE(review): presumably tells the pipeline not to cache this
        # step's result -- confirm against GlobalStep.
        self.cache_step = False

    def process(self, dataset_name, datasets, **kwargs):
        """
        Concatenate the input datasets and store the result under ``self.output``.

        The merge runs only on the first call; later calls return immediately.

        Args:
            dataset_name (str): Unused by this step.
            datasets (dict): Mapping of dataset name to dataset. Read for every
                name in ``self.inputs`` and updated in place with the merged
                dataset under the key ``self.output``.
            **kwargs: Ignored.
        """
        if self.completed:
            return

        # NOTE(review): `self.inputs` is not set in this class; presumably it is
        # populated by GlobalStep from the constructor kwargs -- confirm.
        data = concatenate_datasets([datasets[name] for name in self.inputs])

        # Compare against None instead of relying on truthiness: a seed of 0 is
        # a legitimate seed and must not silently disable shuffling. `False` is
        # still treated as "no shuffling" so boolean-style configs that worked
        # before keep their behavior.
        seed = self.shuffle
        if seed is not None and seed is not False:
            data = data.shuffle(seed)

        datasets[self.output] = data
        self.completed = True

__init__(output, shuffle=None, **kwargs)

Parameters:

  • output (str) –

    Name of the output dataset. Should be unique.

  • shuffle (int, default: None ) –

    Seed for shuffling. Default is None, in which case the merged dataset is not shuffled.

Source code in ragfoundry/processing/global_steps/aggregation.py
def __init__(self, output, shuffle=None, **kwargs):
    """
    Set up the merge step.

    Args:
        output (str): Name under which the merged dataset is stored. Should be unique.
        shuffle (int, optional): Seed used for shuffling. Defaults to None.
        **kwargs: Passed through to the parent class constructor.
    """
    super().__init__(**kwargs)

    # Step configuration as supplied by the caller.
    self.output = output
    self.shuffle = shuffle

    # Both flags start out disabled.
    self.completed = self.cache_step = False