HAM10000

Skin Cancer MNIST: HAM10000 is a large collection of multi-source dermatoscopic images of pigmented lesions.

https://www.kaggle.com/kmader/skin-cancer-mnist-ham10000

Categories:

actinic keratoses and intraepithelial carcinoma / Bowen's disease (akiec)

basal cell carcinoma (bcc)

benign keratosis-like lesions (solar lentigines / seborrheic keratoses and lichen-planus like keratoses, bkl)

dermatofibroma (df)

melanoma (mel)

melanocytic nevi (nv)

vascular lesions (angiomas, angiokeratomas, pyogenic granulomas and hemorrhage, vasc)
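
If readable class names are wanted later (for plots, say), a small lookup dict built from the list above is handy. This is an addition, not part of the original notebook:

dx_names = {
    'akiec': "actinic keratoses / Bowen's disease",
    'bcc':   'basal cell carcinoma',
    'bkl':   'benign keratosis-like lesions',
    'df':    'dermatofibroma',
    'mel':   'melanoma',
    'nv':    'melanocytic nevi',
    'vasc':  'vascular lesions',
}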

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
from fastai.vision import *
from fastai.metrics import accuracy
In [3]:
PATH = "/home/katey/DeepLearning/Data/HAM10000/"
In [4]:
label_csv = f'{PATH}HAM10000_metadata.csv'
In [5]:
label_df = pd.read_csv(label_csv)
In [6]:
label_df.head()
Out[6]:
lesion_id image_id dx dx_type age sex localization
0 HAM_0000118 ISIC_0027419 bkl histo 80.0 male scalp
1 HAM_0000118 ISIC_0025030 bkl histo 80.0 male scalp
2 HAM_0002730 ISIC_0026769 bkl histo 80.0 male scalp
3 HAM_0002730 ISIC_0025661 bkl histo 80.0 male scalp
4 HAM_0001466 ISIC_0031633 bkl histo 75.0 male ear

Number of images

In [7]:
len(label_df.image_id.unique())
Out[7]:
10015

Number of lesions

In [8]:
len(label_df.lesion_id.unique())
Out[8]:
7470

The 10,015 images come from only 7,470 lesions, so many lesions have more than one image. We need to ensure that images from the same lesion do not appear in both the training and validation sets, so instead of sampling images we take a random sample of the unique lesion ids and put all of their images in the validation set.
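
A quick check, not in the original notebook, of how many lesions actually have more than one image:

images_per_lesion = label_df['lesion_id'].value_counts()
(images_per_lesion > 1).sum()   # lesions with two or more images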

In [12]:
np.random.seed(827)
val_lesions = list(np.random.choice(label_df.lesion_id.unique(),size = 3000))
In [13]:
val_idxs = label_df[label_df['lesion_id'].isin(val_lesions)].index
In [14]:
len(val_idxs)
Out[14]:
3303
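
Note that np.random.choice samples with replacement by default, so the 3,000 draws cover somewhat fewer unique lesions; the split is still valid, it just ends up a little smaller than intended. A quick sanity check, not in the original notebook, that the training rows share no lesion with the validation rows:

train_lesions = set(label_df.drop(val_idxs)['lesion_id'])
assert train_lesions.isdisjoint(set(val_lesions))   # lesion-level split: no overlap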

Make a DataFrame with just the image_id and dx columns

In [15]:
reduce_label_df = label_df.drop(columns = ['lesion_id','dx_type','age','sex','localization'])
In [16]:
reduce_label_df.columns = ['filename','label']
In [17]:
reduce_label_df.head()
Out[17]:
filename label
0 ISIC_0027419 bkl
1 ISIC_0025030 bkl
2 ISIC_0026769 bkl
3 ISIC_0025661 bkl
4 ISIC_0031633 bkl

Count how many images there are of each class

In [18]:
reduce_label_df.pivot_table(index="label", aggfunc=len).sort_values('filename', ascending=False)
Out[18]:
filename
label
nv 6705
mel 1113
bkl 1099
bcc 514
akiec 327
vasc 142
df 115
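
The distribution is heavily skewed: nv (melanocytic nevi) accounts for roughly two thirds of the images, while df has only 115. A one-liner to see the class proportions, not in the original notebook:

reduce_label_df['label'].value_counts(normalize=True).round(3)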

Count how many images there are of each class in the validation set

In [19]:
reduce_label_df.iloc[val_idxs].pivot_table(index="label", aggfunc=len).sort_values('filename', ascending=False)
Out[19]:
filename
label
nv 2199
bkl 366
mel 347
bcc 194
akiec 114
vasc 54
df 29
In [20]:
label_csv = f'{PATH}labels.csv'
reduce_label_df.to_csv(label_csv, index = False)
In [21]:
tfms = get_transforms(flip_vert = True)
In [22]:
bs = 64
sz = 64
In [23]:
src = (ImageItemList.from_csv(PATH, 'labels.csv', folder = 'train', suffix = '.jpg')
        .split_by_idx(val_idxs)
        .label_from_df())
In [24]:
data = (src.transform(tfms, size=sz)
        .databunch(bs=bs).normalize(imagenet_stats))
In [36]:
data.show_batch(rows=5, figsize=(12,10))
In [27]:
learn = create_cnn(data, models.resnet34, metrics=accuracy)
In [28]:
learn.lr_find()
learn.recorder.plot()
LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.
In [29]:
lr = 1e-2
In [30]:
learn.fit_one_cycle(5, lr)
Total time: 02:55

epoch train_loss valid_loss accuracy
1 1.186675 0.858878 0.710263
2 0.903502 0.738203 0.734484
3 0.769199 0.687939 0.752346
4 0.693994 0.651152 0.761732
5 0.661096 0.639313 0.769301
In [31]:
learn.unfreeze()
In [32]:
learn.lr_find()
learn.recorder.plot()
LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.
In [33]:
learn.fit_one_cycle(5, slice(1e-5, 1e-4))
Total time: 03:02

epoch train_loss valid_loss accuracy
1 0.650552 0.619801 0.779897
2 0.631828 0.603539 0.780805
3 0.582781 0.580451 0.793218
4 0.560097 0.574121 0.795338
5 0.532885 0.569416 0.801695
In [ ]:
learn.save('mod-resnet34-sz64')
In [35]:
learn.fit_one_cycle(10, slice(1e-5, 1e-4/2))
Total time: 06:30

epoch train_loss valid_loss accuracy
1 0.542999 0.571448 0.798971
2 0.544735 0.566700 0.800484
3 0.552432 0.563527 0.794732
4 0.512192 0.560285 0.796549
5 0.510227 0.558457 0.802906
6 0.497188 0.552658 0.797154
7 0.477804 0.555176 0.799576
8 0.453000 0.548926 0.797457
9 0.450656 0.551730 0.797154
10 0.452580 0.552420 0.796549
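
With such an imbalanced validation set, overall accuracy hides per-class behaviour. A possible check at this point, not in the original notebook, is fastai's classification interpretation:

interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix(figsize=(8, 8))   # per-class confusion matrix
interp.most_confused(min_val=10)               # most common label confusions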

Resize the images to 128 and rebuild the DataBunch (progressive resizing)

In [37]:
sz = 128
In [38]:
data = (src.transform(tfms, size=sz)
        .databunch(bs=bs).normalize(imagenet_stats))
In [39]:
data.show_batch(rows=5, figsize=(12,10))
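
The evident next step in the progressive-resizing pattern, sketched here as an assumption since it falls outside the cells shown above, is to point the existing learner at the 128px DataBunch and repeat the freeze/unfreeze cycle:

learn.data = data        # swap in the 128px DataBunch
learn.freeze()           # retrain only the head at the new size first
learn.lr_find()
learn.recorder.plot()
learn.fit_one_cycle(5, lr)   # re-pick the learning rate from the new plot if needed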