# -*- coding: utf-8 -*-
# LightTwinSVM Program - Simple and Fast
# Version: 0.6.0 - 2019-03-31
# Developer: Mir, A. (mir-am@hotmail.com)
# License: GNU General Public License v3.0
"""
In this module, functions for reading and pre-processing datasets are defined.
"""
from os.path import splitext, split
from sklearn.datasets import load_svmlight_file
import numpy as np
import csv
def conv_str_fl(data):
    """
    It converts string data to float for computation.

    Parameters
    ----------
    data : array-like, shape (n_samples, n_features)
        Training samples, where n_samples is the number of samples
        and n_features is the number of features.

    Returns
    -------
    array-like
        A new float64 array with the same shape as ``data``, which is
        suitable for further computation.

    Raises
    ------
    ValueError
        If an element cannot be parsed as a floating-point number.
    """
    # A single dtype cast converts every element in native code instead of
    # a nested Python loop. astype() returns a new array, so the caller's
    # input is never modified.
    return np.asarray(data).astype(float)
def read_data(filename, header=True):
    """
    It converts a CSV dataset to NumPy arrays for further operations
    like training the TwinSVM classifier.

    The first column of every row is read as the integer class label and
    the remaining columns are read as the features of the sample.

    Parameters
    ----------
    filename : str
        Path to the dataset file.
    header : boolean, optional (default=True)
        Ignores first row of dataset which contains header names.

    Returns
    -------
    data_train : array-like, shape (n_samples, n_features)
        Training samples in NumPy array.
    data_labels : array-like, shape(n_samples,)
        Class labels of training samples.
    file_name : str
        Dataset's filename (without directory and extension).
    """
    # The context manager closes the file even when parsing fails, and
    # newline='' lets the csv module handle line endings itself.
    with open(filename, 'r', newline='') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')

        # Ignore header names
        if header:
            next(reader, None)

        # csv.reader yields an empty list for a blank line; such rows would
        # make the array ragged, so they are dropped here.
        rows = [row for row in reader if row]

    # Rows are kept in the order they appear in the file (no shuffling).
    data_array = np.array(rows)

    # Convert string data to float (every column except the first one)
    data_train = conv_str_fl(data_array[:, 1:])
    # The first column holds the class labels
    data_labels = np.array([int(label) for label in data_array[:, 0]])
    file_name = splitext(split(filename)[-1])[0]

    return data_train, data_labels, file_name
def read_libsvm(filename):
    """
    It reads `LIBSVM <https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/>`_
    data files for doing classification using the TwinSVM model.

    Parameters
    ----------
    filename : str
        Path to the LIBSVM data file.

    Returns
    -------
    array-like
        Training samples.
    array-like
        Class labels of training samples.
    str
        Dataset's filename
    """
    samples, labels = load_svmlight_file(filename)
    file_name = splitext(split(filename)[-1])[0]

    # Converting sparse CSR matrix to NumPy array. The labels are cast with
    # the builtin int: the np.int alias was removed in NumPy 1.24.
    return samples.toarray(), labels.astype(int), file_name