import numpy as np
import os

# Seed NumPy's global RNG so the np.random.shuffle call further down draws the
# same permutation on every run.
# NOTE(review): the resulting split is only repeatable if os.walk also yields
# the files in the same order each time -- confirm on the target filesystem.
np.random.seed(5112017)
# Root directory that os.walk scans for the msd-25 archives. This is an
# absolute, machine-specific path and must be edited to run elsewhere.
data_path = "/home/antreas/mlpractical_2016-2017/mlpractical/data"
def unpickle(file):
    """Return the object stored in the pickle file at path ``file``.

    The file is opened in binary mode and read with ``encoding='bytes'``,
    which is forwarded unchanged to ``pickle.load``.

    NOTE(review): unpickling can execute arbitrary code, so this should only
    be pointed at files from a trusted source.
    """
    import pickle

    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding='bytes')

# Rows from every selected archive are accumulated here and stacked into
# arrays once the directory walk is finished.
train_data = []
train_labels = []

for subdir, _dirs, filenames in os.walk(data_path):
    for filename in filenames:
        # Only msd-25 archives are of interest; HTML, metadata and text files
        # that happen to share the prefix are skipped.
        if "msd-25" not in filename:
            continue
        if "html" in filename or "meta" in filename or ".txt" in filename:
            continue

        filepath = os.path.join(subdir, filename)
        # Printed before loading so a failing archive can be identified.
        print(filepath)
        # np.load keeps an .npz archive open until it is closed; the context
        # manager releases the file handle as soon as the arrays are read.
        with np.load(filepath) as data_batch:
            print(filepath, data_batch.keys())
            # Archives whose name contains "test" or "var" are listed above
            # but not merged into the pool that gets re-split later.
            if "test" not in filename and "var" not in filename:
                train_data.extend(data_batch['inputs'])
                train_labels.extend(data_batch['targets'])

x_train = np.array(train_data)
y_train = np.array(train_labels)


# Shuffle inputs and targets with one shared permutation (drawn from NumPy's
# global RNG) so that every input row stays paired with its target.
ids = np.arange(x_train.shape[0])
np.random.shuffle(ids)

x_train = x_train[ids]
y_train = y_train[ids]

# Split boundaries: rows [0, 75%) train, [75%, 85%) validation, [85%, 100%) test.
val_start_index = int(0.75 * x_train.shape[0])
test_start_index = int(0.85 * x_train.shape[0])
print(val_start_index)

# The validation slice must stop where the test slice begins; an open-ended
# slice here would place every test row in the validation set as well.
x_val = x_train[val_start_index:test_start_index]
y_val = y_train[val_start_index:test_start_index]

x_test = x_train[test_start_index:]
y_test = y_train[test_start_index:]

# Truncate the training arrays last: the slices above index into the full,
# shuffled arrays.
x_train = x_train[:val_start_index]
y_train = y_train[:val_start_index]


# np.savez does not create missing parent directories, so make sure the
# output directory exists before writing instead of failing after all the
# loading and shuffling work has been done.
os.makedirs("data", exist_ok=True)

# Each split is stored as an archive with "inputs" and "targets" arrays;
# np.savez appends the ".npz" suffix to the given file names.
np.savez("data/msd25-train", inputs=x_train, targets=y_train)
np.savez("data/msd25-valid", inputs=x_val, targets=y_val)
np.savez("data/msd25-test", inputs=x_test, targets=y_test)
print(x_train.shape, y_train.shape, x_val.shape)