Skip to content

Output

HFHubOutput

Bases: GlobalStep

Simple class to output the dataset to Hugging Face Hub.

Caching is disabled because this step does not modify the dataset, so there is nothing to cache.

Source code in ragfoundry/processing/global_steps/output.py
class HFHubOutput(GlobalStep):
    """
    Global step that uploads a dataset to the Hugging Face Hub.

    The step only pushes the dataset and does not modify it, so caching is disabled.
    """

    def __init__(self, hfhub_tag, private=True, **kwargs):
        """
        Args:
            hfhub_tag (str): Tag for the Hugging Face Hub; passed to `push_to_hub` as its first argument.
            private (bool): Forwarded to `push_to_hub` as `private`. Default is True.
            **kwargs: Forwarded unchanged to the `GlobalStep` constructor.
        """
        super().__init__(**kwargs)
        # Caching is disabled for this step.
        self.cache_step = False
        self.private = private
        self.hfhub_tag = hfhub_tag

    def process(self, dataset_name, datasets, **kwargs):
        """
        Push `datasets[dataset_name]` to the Hub under `self.hfhub_tag`.

        Args:
            dataset_name: Key selecting which entry of `datasets` to upload.
            datasets: Mapping from dataset names to dataset objects exposing `push_to_hub`.
        """
        selected = datasets[dataset_name]
        selected.push_to_hub(self.hfhub_tag, private=self.private)

__init__(hfhub_tag, private=True, **kwargs)

Parameters:

  • hfhub_tag (str) –

    Tag for the Hugging Face Hub.

  • private (bool, default: True ) –

    Whether the dataset should be private or not. Default is True.

Source code in ragfoundry/processing/global_steps/output.py
def __init__(self, hfhub_tag, private=True, **kwargs):
    """
    Configure the Hugging Face Hub output step.

    Args:
        hfhub_tag (str): Tag for the Hugging Face Hub.
        private (bool): Whether the dataset should be private or not. Default is True.
        **kwargs: Forwarded unchanged to the parent constructor.
    """
    super().__init__(**kwargs)
    # Caching is disabled for this step.
    self.cache_step = False
    self.private = private
    self.hfhub_tag = hfhub_tag

OutputData

Bases: GlobalStep

Simple class to output the dataset to a jsonl file.

Caching is disabled because this step does not modify the dataset, so there is nothing to cache.

Source code in ragfoundry/processing/global_steps/output.py
class OutputData(GlobalStep):
    """
    Simple class to output the dataset to a jsonl file.

    Caching is disabled because this step does not modify the dataset.
    """

    def __init__(self, prefix, filename=None, directory=None, **kwargs):
        """
        Args:
            prefix (str): Prefix for the output file name.
            filename (str, optional): Name used in the output file name. If not provided, the dataset name is used instead.
            directory (str, optional): Directory to save the output file. If not provided, the file name is used as-is, without a directory component.
            **kwargs: Forwarded unchanged to the `GlobalStep` constructor.

        The output name is `{prefix}-{filename}.jsonl` when `filename` is provided,
        and `{prefix}-{dataset_name}.jsonl` otherwise.
        """
        super().__init__(**kwargs)
        self.prefix = prefix
        self.filename = filename
        self.dir = directory
        self.cache_step = False

    def process(self, dataset_name, datasets, **kwargs):
        """
        Write `datasets[dataset_name]` to a JSON-lines file.

        Args:
            dataset_name: Key selecting which entry of `datasets` to write; also
                used in the file name when no `filename` was configured.
            datasets: Mapping from dataset names to dataset objects exposing `to_json`.
        """
        # An empty or missing filename falls back to the dataset name.
        name = self.filename or dataset_name
        fname = f"{self.prefix}-{name}.jsonl"
        # A single truthiness check covers both None and the empty string,
        # in which case the bare file name is used.
        if self.dir:
            fname = os.path.join(self.dir, fname)
        datasets[dataset_name].to_json(fname, lines=True)

__init__(prefix, filename=None, directory=None, **kwargs)

Parameters:

  • prefix (str) –

    Prefix for the output.

  • filename (str, default: None ) –

    Name of the output file. If not provided, the output file name will be generated based on the prefix and dataset name.

  • directory (str, default: None ) –

    Directory to save the output file. If not provided, the output file will be saved in the current directory.

The output name is {prefix}-{filename}.jsonl when filename is provided, and {prefix}-{dataset_name}.jsonl otherwise.

Source code in ragfoundry/processing/global_steps/output.py
def __init__(self, prefix, filename=None, directory=None, **kwargs):
    """
    Configure the jsonl output step.

    Args:
        prefix (str): Prefix for the output file name.
        filename (str, optional): Name used in the output file name. If not provided, the dataset name is used instead.
        directory (str, optional): Directory to save the output file. If not provided, no directory component is added to the file name.
        **kwargs: Forwarded unchanged to the parent constructor.

    The output name is `{prefix}-{filename}.jsonl` when `filename` is provided,
    and `{prefix}-{dataset_name}.jsonl` otherwise.
    """
    super().__init__(**kwargs)
    # Caching is disabled for this step.
    self.cache_step = False
    self.dir = directory
    self.filename = filename
    self.prefix = prefix