Visit the wiki pages for additional documentation and instructions on how to view an interactive version of these notebooks using Binder.

In [1]:
import math
import numpy as np
import matplotlib.pyplot as plt


from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report

import os
import csv
import random

%matplotlib inline

Step 1: Load the CSV files that contain the generated features

In [2]:
features = []
labels = []

with open('./papsmear-features-normal.csv', newline='') as csvfile:
    stored_features = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in stored_features:
        # Each row: filename, class label, then the feature values.
        filename = row[0]
        class_label = row[1]
        v = np.array(row[2:], dtype=np.float32)
        features.append([filename, class_label, v])
        labels.append(class_label)
In [3]:
with open('./papsmear-features-displastic.csv', newline='') as csvfile:
    stored_features = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in stored_features:
        # Same row layout as the normal-class file.
        filename = row[0]
        class_label = row[1]
        v = np.array(row[2:], dtype=np.float32)
        features.append([filename, class_label, v])
        labels.append(class_label)
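Since the two cells above differ only in the file path, the same logic could be wrapped in a small helper. This is just a sketch, not part of the original notebook: the name load_features is made up, and it assumes every row has the layout filename, class label, feature values used above (it relies on the csv and numpy imports from the first cell).

def load_features(path, features, labels):
    """Append [filename, label, feature-vector] records from one CSV file."""
    with open(path, newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=',', quotechar='|'):
            filename, class_label = row[0], row[1]
            v = np.array(row[2:], dtype=np.float32)
            features.append([filename, class_label, v])
            labels.append(class_label)

# Equivalent to the two cells above:
# load_features('./papsmear-features-normal.csv', features, labels)
# load_features('./papsmear-features-displastic.csv', features, labels)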

Step 2: Split the data into training and testing

In [4]:
feature_length = len(features[0][2])
features, labels = shuffle(features, labels, random_state=0)
In [5]:
N_train = 200
features_train = features[:N_train]
features_test = features[N_train:]

labels_train = labels[:N_train]
labels_test = labels[N_train:]
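As an aside, scikit-learn's train_test_split can produce the same kind of split while preserving the class balance in both subsets via its stratify option. A minimal alternative sketch (not used in this notebook); the downstream array-building cells would stay the same:

from sklearn.model_selection import train_test_split

# Stratified alternative to the fixed-index split above.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, train_size=N_train, stratify=labels, random_state=0)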
In [6]:
# Stack the feature vectors into dense NumPy arrays for scikit-learn.
data_train = np.zeros((len(features_train), feature_length))
data_test  = np.zeros((len(features_test), feature_length))
In [7]:
for i, f in enumerate(features_train):
    data_train[i, :] = f[2]

for i, f in enumerate(features_test):
    data_test[i, :] = f[2]
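PCA and matplotlib are imported in the first cell but not otherwise used in this section; one optional use is a quick 2-D projection of the training features to see how separable the two classes look. A sketch, assuming the class labels are the strings '0' and '1' as in the CSV files:

# Optional: project the training features onto the first two
# principal components and plot them, colored by class.
pca = PCA(n_components=2)
proj = pca.fit_transform(data_train)

labels_arr = np.asarray(labels_train)
for c in ('0', '1'):
    mask = labels_arr == c
    plt.scatter(proj[mask, 0], proj[mask, 1], label='class ' + c, alpha=0.6)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend()
plt.show()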

Step 3: Set up and train the classifier

In [8]:
clf = LogisticRegression(random_state=0, max_iter=500)
In [9]:
clf.fit(data_train, np.asarray(labels_train))
Out[9]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
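The lbfgs solver shown in the output above typically converges faster on standardized features, which may be why max_iter was raised to 500 here. An optional variation (a sketch, not part of the original notebook) is to wrap the classifier in a pipeline that scales each feature to zero mean and unit variance first:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scale features, then fit the same logistic regression.
clf_scaled = make_pipeline(StandardScaler(),
                           LogisticRegression(random_state=0, max_iter=500))
clf_scaled.fit(data_train, np.asarray(labels_train))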
In [10]:
prediction = clf.predict(data_test)
print(prediction)
['0' '0' '1' '1' '1' '0' '1' '1' '1' '0' '0' '1' '0' '1' '0' '1' '0' '0'
 '1' '1' '0' '1' '1' '0' '0' '1' '1' '1' '0' '0' '0' '0' '1' '1' '1' '0'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '0' '0' '0' '1' '1'
 '1' '1' '1' '0' '1' '1' '1' '1' '0' '1' '0' '1' '1' '1' '1' '0' '0']
In [11]:
print(classification_report(labels_test, prediction))
              precision    recall  f1-score   support

           0       0.64      0.94      0.76        17
           1       0.98      0.83      0.90        54

    accuracy                           0.86        71
   macro avg       0.81      0.89      0.83        71
weighted avg       0.90      0.86      0.87        71
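The report above summarizes per-class precision and recall; a confusion matrix makes the raw error counts behind those numbers explicit. A minimal sketch using scikit-learn's confusion_matrix:

from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes ('0' then '1').
print(confusion_matrix(labels_test, prediction, labels=['0', '1']))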
