%reload_ext autoreload
%autoreload 2
%matplotlib inline
from fastai.vision import *
from fastai.metrics import accuracy
PATH = "/home/katey/DeepLearning/Data/HAM10000/"
label_csv = f'{PATH}HAM10000_metadata.csv'
label_df = pd.read_csv(label_csv)
label_df.head()
Number of images
len(label_df.image_id.unique())
Number of lesions
len(label_df.lesion_id.unique())
Note that perhaps a quarter of the lesions have multiple images, so we need to ensure that images of the same lesion do not appear in both the training and validation sets. Take a random sample of the unique lesion ids to form the validation set (the split is verified below)
np.random.seed(827)
val_lesions = list(np.random.choice(label_df.lesion_id.unique(), size=3000, replace=False))  # replace=False so all 3000 sampled lesion ids are distinct
val_idxs = label_df[label_df['lesion_id'].isin(val_lesions)].index
len(val_idxs)
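As a sanity check (a small addition using only the objects defined above), confirm that no lesion contributes images to both splits:
train_lesions = set(label_df.drop(val_idxs).lesion_id)
assert train_lesions.isdisjoint(val_lesions)  # no lesion appears in both training and validation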
Make a dataframe with just the image id and the diagnosis (dx)
reduce_label_df = label_df.drop(columns = ['lesion_id','dx_type','age','sex','localization'])
reduce_label_df.columns = ['filename','label']
reduce_label_df.head()
Count how many images there are of each diagnosis
reduce_label_df.pivot_table(index="label", aggfunc=len).sort_values('filename', ascending=False)
Count how many images of each diagnosis are in the validation set
reduce_label_df.iloc[val_idxs].pivot_table(index="label", aggfunc=len).sort_values('filename', ascending=False)
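Since the split is random at the lesion level, it is worth confirming that the validation class proportions roughly track the full dataset (a quick check, not in the original run):
reduce_label_df.label.value_counts(normalize=True)
reduce_label_df.iloc[val_idxs].label.value_counts(normalize=True)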
label_csv = f'{PATH}labels.csv'
reduce_label_df.to_csv(label_csv, index = False)
tfms = get_transforms(flip_vert = True)
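Dermoscopy images have no canonical orientation, so vertical flips (with flip_vert=True, fastai v1 uses the full set of dihedral transforms, i.e. flips plus 90-degree rotations) are safe augmentations here. get_transforms returns a (train, validation) pair of transform lists that can be inspected directly:
train_tfms, valid_tfms = tfms
train_tfms  # augmentations applied only to the training images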
bs = 64
sz = 64
src = (ImageItemList.from_csv(PATH, 'labels.csv', folder = 'train', suffix = '.jpg')
.split_by_idx(val_idxs)
.label_from_df())
data = (src.transform(tfms, size=sz)
.databunch(bs=bs).normalize(imagenet_stats))
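Before training, a quick look at the databunch confirms the classes and split sizes came through as intended:
data.classes, data.c  # the seven diagnosis labels
len(data.train_ds), len(data.valid_ds)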
data.show_batch(rows=5, figsize=(12,10))
learn = create_cnn(data, models.resnet34, metrics=accuracy)
learn.lr_find()
learn.recorder.plot()
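Pick a learning rate from the steep, downward section of the plot above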
lr = 1e-2
learn.fit_one_cycle(5, lr)
learn.unfreeze()
learn.lr_find()
learn.recorder.plot()
learn.fit_one_cycle(5, slice(1e-5, 1e-4))
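Here slice(1e-5, 1e-4) applies fastai's discriminative learning rates: the earliest layer groups train at 1e-5, the head at 1e-4, with the groups in between spaced geometrically.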
learn.save('mod-resnet34-sz64')
learn.fit_one_cycle(10, slice(1e-5, 1e-4/2))
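If the longer run starts to overfit, the checkpoint saved above can be restored with the counterpart of learn.save:
# learn.load('mod-resnet34-sz64')  # optional rollback to the saved weights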
Resize the images to 128 and continue training the same model (progressive resizing)
sz = 128
data = (src.transform(tfms, size=sz)
.databunch(bs=bs).normalize(imagenet_stats))
data.show_batch(rows=5, figsize=(12,10))
learn.data = data
data.train_ds[0][0].shape
learn.freeze()
learn.lr_find()
learn.recorder.plot()
lr=1e-3
learn.fit_one_cycle(5, lr)
learn.unfreeze()
learn.lr_find()
learn.recorder.plot()
learn.fit_one_cycle(5, slice(1e-5, 1e-4))
learn.save('mod-resnet34-sz128')
learn.fit_one_cycle(10, slice(1e-5, 1e-4/2))
learn.save('mod-resnet34-sz128')
Resize to 256, dropping the batch size to 32 so the larger images still fit in GPU memory
sz = 256
bs = 32
data = (src.transform(tfms, size=sz)
.databunch(bs=bs).normalize(imagenet_stats))
data.show_batch(rows=5, figsize=(12,10))
learn.data = data
data.train_ds[0][0].shape
learn.freeze()
learn.lr_find()
learn.recorder.plot()
lr=1e-2
learn.fit_one_cycle(5, lr)
learn.unfreeze()
learn.lr_find()
learn.recorder.plot()
learn.fit_one_cycle(5, slice(1e-5, 1e-4))
learn.save('mod-resnet34-sz256')
Try a larger architecture: ResNet-50
learn = create_cnn(data, models.resnet50, metrics=accuracy)
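# note: this builds a fresh ResNet-50 learner from ImageNet-pretrained weights; the ResNet-34 training above is not carried over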
learn.lr_find()
learn.recorder.plot()
lr = 1e-2
learn.fit_one_cycle(5, lr)
learn.unfreeze()
learn.lr_find()
learn.recorder.plot()
learn.fit_one_cycle(5, slice(1e-5, 1e-4))
learn.save('mod-resnet50-sz256')
learn.fit_one_cycle(10, slice(1e-5, 1e-4/2))
interp = ClassificationInterpretation.from_learner(learn)
losses,idxs = interp.top_losses()
len(data.valid_ds)==len(losses)==len(idxs)
interp.plot_top_losses(16, figsize=(12,10))
interp.plot_confusion_matrix(figsize=(5,5), dpi=90)
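The confusion matrix can also be reduced to per-class recall, which is worth watching on an imbalanced dataset like this one (a small sketch built on interp.confusion_matrix(), not part of the original notebook):
cm = interp.confusion_matrix()
recall = cm.diagonal() / cm.sum(axis=1)  # fraction of each true class predicted correctly
for c, r in zip(data.classes, recall): print(f'{c}: {r:.3f}')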
interp.most_confused(min_val=2)
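As an optional last step, fastai's test-time augmentation often adds a small bump in validation accuracy (a sketch, not run in the original):
preds, y = learn.TTA()
accuracy(preds, y)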