Source code for dataproc

# -*- coding: utf-8 -*-

# LightTwinSVM Program - Simple and Fast
# Version: 0.6.0 - 2019-03-31
# Developer: Mir, A. (mir-am@hotmail.com)
# License: GNU General Public License v3.0

"""
In this module, functions for reading and pre-processing datasets are defined.
"""


from os.path import splitext, split
from sklearn.datasets import load_svmlight_file
import numpy as np
import csv


def conv_str_fl(data):
    """
    It converts string data to float for computation.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Training samples, where n_samples is the number of samples
        and n_features is the number of features.

    Returns
    -------
    array-like
        A numerical dataset which is suitable for further computation.
    """

    temp_data = np.zeros(data.shape)

    # Read rows
    for i in range(data.shape[0]):

        # Read columns
        for j in range(data.shape[1]):

            temp_data[i][j] = float(data[i][j])

    return temp_data
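
# Illustrative usage sketch (not part of the module): conv_str_fl casts a string
# matrix to floats element by element. The sample values below are made up.
#
#   >>> import numpy as np
#   >>> raw = np.array([['1.5', '2.0'], ['0.3', '4.1']])
#   >>> conv_str_fl(raw)
#   array([[1.5, 2. ],
#          [0.3, 4.1]])
#
# Note that np.asarray(raw, dtype=float) would produce the same result in one call.
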
def read_data(filename, header=True):
    """
    It converts a CSV dataset to NumPy arrays for further operations,
    like training the TwinSVM classifier.

    Parameters
    ----------
    filename : str
        Path to the dataset file.

    header : boolean, optional (default=True)
        Ignores the first row of the dataset, which contains header names.

    Returns
    -------
    data_train : array-like, shape (n_samples, n_features)
        Training samples in NumPy array.

    data_labels : array-like, shape (n_samples,)
        Class labels of training samples.

    file_name : str
        Dataset's filename.
    """

    data = open(filename, 'r')
    data_csv = csv.reader(data, delimiter=',')

    # Ignore header names
    if not header:

        data_array = np.array(list(data_csv))

    else:

        data_array = np.array(list(data_csv)[1:])  # [1:] for removing headers

    data.close()

    # Shuffle data
    #np.random.shuffle(data_array)

    # Convert string data to float
    data_train = conv_str_fl(data_array[:, 1:])

    data_labels = np.array([int(i) for i in data_array[:, 0]])

    file_name = splitext(split(filename)[-1])[0]

    return data_train, data_labels, file_name
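
# Illustrative usage sketch (not part of the module): reading a CSV file whose
# first column holds integer class labels and whose remaining columns hold the
# features. The path below is a hypothetical example.
#
#   X, y, name = read_data('dataset/hepatitis.csv')
#   # X -> float feature matrix, y -> integer class labels, name -> 'hepatitis'
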
def read_libsvm(filename):
    """
    It reads `LIBSVM <https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/>`_
    data files for doing classification using the TwinSVM model.

    Parameters
    ----------
    filename : str
        Path to the LIBSVM data file.

    Returns
    -------
    array-like
        Training samples.

    array-like
        Class labels of training samples.

    str
        Dataset's filename.
    """

    libsvm_data = load_svmlight_file(filename)

    file_name = splitext(split(filename)[-1])[0]

    # Converting the sparse CSR matrix to a dense NumPy array
    return libsvm_data[0].toarray(), libsvm_data[1].astype(int), file_name
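
# Illustrative usage sketch (not part of the module): reading a LIBSVM-formatted
# file. The path below is a hypothetical example; load_svmlight_file returns a
# sparse matrix, which read_libsvm densifies before returning.
#
#   X, y, name = read_libsvm('dataset/heart_scale.libsvm')
#   # X -> dense float feature matrix, y -> integer labels, name -> 'heart_scale'
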