Visualise the data using Principal Component Analysis. Features in the CSV file have been precomputed.
import math
import numpy as np
import matplotlib.pyplot as plt
import skimage
from skimage import io, measure, exposure
from skimage.color import rgba2rgb, rgb2gray
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import os
import csv
import random
%matplotlib inline
features = []
with open('../images/papsmear-features-normal.csv', newline='') as csvfile:
stored_features = csv.reader(csvfile, delimiter=',', quotechar='|')
for row in stored_features:
filename = row[0]
class_label = row[1]
v = np.array(row[2:len(row)], dtype=np.float32)
f = [filename, class_label, v]
features.append(f)
with open('../images/papsmear-features-displastic.csv', newline='') as csvfile:
stored_features = csv.reader(csvfile, delimiter=',', quotechar='|')
for row in stored_features:
filename = row[0]
class_label = row[1]
v = np.array(row[2:len(row)], dtype=np.float32)
f = [filename, class_label, v]
features.append(f)
feature_length = len(features[0][2])
print(feature_length)
random.shuffle(features)
features_train = features[0:280]
features_test = features[280:len(features)]
data_train = np.zeros((len(features_train), feature_length))
colours_train = np.zeros(len(features_train))
for i in range(0, len(features_train)):
data_train[i, :] = features_train[i][2]
if (features_train[i][1] == '0'):
colours_train[i] = 0.25
else:
colours_train[i] = 0.75
pca = PCA(n_components=2)
train_pca = pca.fit_transform(data_train)
train_projected = pca.transform(data_train)
plt.scatter(train_projected[:,0], train_projected[:, 1], c=colours_train, marker='o');