Visit the wiki pages for additional documentation and instructions on how to view an interactive version of these notebooks using Binder.

In [1]:
import math
import numpy as np
import matplotlib.pyplot as plt


from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report

import os
import csv
import random

%matplotlib inline

Step 1: Load the CSV files that contain the generated features

In [2]:
features = []
labels = []

with open('./papsmear-features-normal.csv', newline='') as csvfile:
    stored_features = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in stored_features:
        # Each row: filename, class label, then the feature values.
        filename = row[0]
        class_label = row[1]
        v = np.array(row[2:], dtype=np.float32)
        features.append([filename, class_label, v])
        labels.append(class_label)
In [3]:
with open('./papsmear-features-displastic.csv', newline='') as csvfile:
    stored_features = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in stored_features:
        # Same row layout as the normal-class file.
        filename = row[0]
        class_label = row[1]
        v = np.array(row[2:], dtype=np.float32)
        features.append([filename, class_label, v])
        labels.append(class_label)
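Since the two cells above differ only in the file path, the same logic could be wrapped in a small helper. This is just a sketch, not part of the original notebook: the name load_features is made up, and it assumes every row has the layout filename, class label, feature values used above (it relies on the csv and numpy imports from the first cell).

def load_features(path, features, labels):
    """Append [filename, label, feature-vector] records from one CSV file."""
    with open(path, newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=',', quotechar='|'):
            filename, class_label = row[0], row[1]
            v = np.array(row[2:], dtype=np.float32)
            features.append([filename, class_label, v])
            labels.append(class_label)

# Equivalent to the two cells above:
# load_features('./papsmear-features-normal.csv', features, labels)
# load_features('./papsmear-features-displastic.csv', features, labels)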

Step 2: Split the data into training and testing

In [4]:
feature_length = len(features[0][2])
features, labels = shuffle(features, labels, random_state=0)
In [5]:
N_train = 200
features_train = features[:N_train]
features_test = features[N_train:]

labels_train = labels[:N_train]
labels_test = labels[N_train:]
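As an aside, scikit-learn's train_test_split can produce the same kind of split while preserving the class balance in both subsets via its stratify option. A minimal alternative sketch (not used in this notebook); the downstream array-building cells would stay the same:

from sklearn.model_selection import train_test_split

# Stratified alternative to the fixed-index split above.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, train_size=N_train, stratify=labels, random_state=0)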
In [6]:
# Stack the feature vectors into dense NumPy arrays for scikit-learn.
data_train = np.zeros((len(features_train), feature_length))
data_test  = np.zeros((len(features_test), feature_length))
In [7]:
for i, f in enumerate(features_train):
    data_train[i, :] = f[2]

for i, f in enumerate(features_test):
    data_test[i, :] = f[2]
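PCA and matplotlib are imported in the first cell but not otherwise used in this section; one optional use is a quick 2-D projection of the training features to see how separable the two classes look. A sketch, assuming the class labels are the strings '0' and '1' as in the CSV files:

# Optional: project the training features onto the first two
# principal components and plot them, colored by class.
pca = PCA(n_components=2)
proj = pca.fit_transform(data_train)

labels_arr = np.asarray(labels_train)
for c in ('0', '1'):
    mask = labels_arr == c
    plt.scatter(proj[mask, 0], proj[mask, 1], label='class ' + c, alpha=0.6)
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.legend()
plt.show()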

Step 3: Set up and train the classifier

In [8]:
clf = LogisticRegression(random_state=0, max_iter=500)
In [9]:
clf.fit(data_train, np.asarray(labels_train))
Out[9]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
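The lbfgs solver shown in the output above typically converges faster on standardized features, which may be why max_iter was raised to 500 here. An optional variation (a sketch, not part of the original notebook) is to wrap the classifier in a pipeline that scales each feature to zero mean and unit variance first:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Scale features, then fit the same logistic regression.
clf_scaled = make_pipeline(StandardScaler(),
                           LogisticRegression(random_state=0, max_iter=500))
clf_scaled.fit(data_train, np.asarray(labels_train))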
In [10]:
prediction = clf.predict(data_test)
print(prediction)
['0' '0' '1' '1' '1' '0' '1' '1' '1' '0' '0' '1' '0' '1' '0' '1' '0' '0'
 '1' '1' '0' '1' '1' '0' '0' '1' '1' '1' '0' '0' '0' '0' '1' '1' '1' '0'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '0' '0' '0' '1' '1'
 '1' '1' '1' '0' '1' '1' '1' '1' '0' '1' '0' '1' '1' '1' '1' '0' '0']
In [11]:
print(classification_report(labels_test, prediction))
              precision    recall  f1-score   support

           0       0.64      0.94      0.76        17
           1       0.98      0.83      0.90        54

    accuracy                           0.86        71
   macro avg       0.81      0.89      0.83        71
weighted avg       0.90      0.86      0.87        71
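The report above summarizes per-class precision and recall; a confusion matrix makes the raw error counts behind those numbers explicit. A minimal sketch using scikit-learn's confusion_matrix:

from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes ('0' then '1').
print(confusion_matrix(labels_test, prediction, labels=['0', '1']))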
