在机器学习和深度学习中,模型训练通常基于数据独立同分布(i.i.d.)的假设,因此数据的顺序应当是随机的。
一个模型训练任务通常包含多次迭代(即多次遍历整个数据集)。在每一次迭代中重新打乱(shuffle)数据,有助于提高模型的泛化性能。
import numpy as np
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
def timeseries_dataset_from_array(
        data,
        sequence_length,
        sequence_stride=1,
        sampling_rate=1,
        batch_size=128,
        shuffle=False,
        seed=None,
        start_index=None,
        end_index=None):
    """Build a batched dataset of sliding windows taken from a timeseries.

    Each window starts at some position `p` (relative to `start_index`) and
    contains the `sequence_length` elements
    `p, p + sampling_rate, ..., p + (sequence_length - 1) * sampling_rate`.
    Consecutive windows start `sequence_stride` positions apart.

    Args:
      data: Indexable, sliceable array-like holding the timesteps along its
        first axis; must support `len()`.
      sequence_length: Number of elements in each output window (positive).
      sequence_stride: Distance between the starts of successive windows
        (positive).
      sampling_rate: Distance between successive elements inside a window
        (positive).
      batch_size: Number of windows per batch.
      shuffle: If true, the start positions are permuted once with NumPy and
        the resulting dataset is additionally shuffled with a buffer of
        `batch_size * 8` windows, so the order changes on every iteration
        over the dataset.
      seed: Optional seed used for both the NumPy permutation and the dataset
        shuffle. A random one is drawn when `shuffle` is true and `seed` is
        None.
      start_index: First position of `data` to use (defaults to 0).
      end_index: One past the last position of `data` to use (defaults to
        `len(data)`).

    Returns:
      A dataset yielding batches of windows. If the selected range is too
      short to hold a single window the dataset is empty.

    Raises:
      ValueError: If `sequence_length`, `sequence_stride` or `sampling_rate`
        is not strictly positive.
    """
    # Reject parameters that cannot describe a valid window up front instead
    # of failing later with an unrelated error (or silently yielding nothing).
    if sequence_length <= 0:
        raise ValueError(
            '`sequence_length` must be positive. Received: %s'
            % (sequence_length,))
    if sequence_stride <= 0:
        raise ValueError(
            '`sequence_stride` must be positive. Received: %s'
            % (sequence_stride,))
    if sampling_rate <= 0:
        raise ValueError(
            '`sampling_rate` must be positive. Received: %s'
            % (sampling_rate,))

    if start_index is None:
        start_index = 0
    if end_index is None:
        end_index = len(data)

    # The last element read by a window starting at p sits at offset
    # p + (sequence_length - 1) * sampling_rate, so that span (not
    # sequence_length * sampling_rate - 1) is what must fit in the range.
    num_seqs = end_index - start_index - (sequence_length - 1) * sampling_rate

    # Use the smallest dtype able to store the start positions (to lower
    # memory usage).
    if num_seqs < 2147483647:
        index_dtype = 'int32'
    else:
        index_dtype = 'int64'

    # Generate start positions; this is empty when num_seqs <= 0.
    start_positions = np.arange(
        0, num_seqs, sequence_stride, dtype=index_dtype)
    if shuffle:
        if seed is None:
            seed = np.random.randint(1000000)
        rng = np.random.RandomState(seed)
        rng.shuffle(start_positions)

    sequence_length = math_ops.cast(sequence_length, dtype=index_dtype)
    sampling_rate = math_ops.cast(sampling_rate, dtype=index_dtype)

    # The full position table is repeated so that every element of the
    # counter dataset below can look up its own start position.
    positions_ds = dataset_ops.Dataset.from_tensors(start_positions).repeat()

    # For each initial window position, generate the indices of the window
    # elements.
    indices = dataset_ops.Dataset.zip(
        (dataset_ops.Dataset.range(len(start_positions)), positions_ds)).map(
            lambda i, positions: math_ops.range(  # pylint: disable=g-long-lambda
                positions[i],
                positions[i] + sequence_length * sampling_rate,
                sampling_rate),
            num_parallel_calls=dataset_ops.AUTOTUNE)

    dataset = sequences_from_indices(data, indices, start_index, end_index)
    if shuffle:
        # Shuffle locally at each iteration.
        dataset = dataset.shuffle(buffer_size=batch_size * 8, seed=seed)
    dataset = dataset.batch(batch_size)
    return dataset
def sequences_from_indices(array, indices_ds, start_index, end_index):
    """Turn a dataset of index vectors into a dataset of gathered windows.

    Args:
      array: Sliceable source array.
      indices_ds: Dataset whose elements are index tensors, relative to
        `start_index`.
      start_index: First position of `array` to expose to the indices.
      end_index: One past the last position of `array` to expose.

    Returns:
      A dataset with one element per element of `indices_ds`, each being the
      entries of `array[start_index:end_index]` selected by those indices.
    """
    def _gather_window(series, window_indices):
        return array_ops.gather(series, window_indices)

    # The sliced series is a single element repeated indefinitely, so the
    # zipped dataset ends when `indices_ds` does.
    series_ds = dataset_ops.Dataset.from_tensors(
        array[start_index: end_index]).repeat()
    paired_ds = dataset_ops.Dataset.zip((series_ds, indices_ds))
    return paired_ds.map(
        _gather_window, num_parallel_calls=dataset_ops.AUTOTUNE)
# Demo: 12 timesteps cut into four non-overlapping windows of length 3
# (stride 3), shuffled and grouped into batches of two windows.
X = np.arange(12)
input_dataset = timeseries_dataset_from_array(
    X,
    sequence_length=3,
    sequence_stride=3,
    shuffle=True,
    batch_size=2,
)

# Walk over the same dataset several times to show that the window order
# differs from one pass to the next.
for epoch in range(3):
    print("--------------iteration", epoch)
    for batch in input_dataset:
        print(batch)
Outputs:
--------------iteration 0
tf.Tensor( [[ 6 7 8] [ 9 10 11]], shape=(2, 3), dtype=int32)
tf.Tensor( [[3 4 5] [0 1 2]], shape=(2, 3), dtype=int32)
--------------iteration 1
tf.Tensor( [[3 4 5] [6 7 8]], shape=(2, 3), dtype=int32)
tf.Tensor( [[ 0 1 2] [ 9 10 11]], shape=(2, 3), dtype=int32)