%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from fastai.tabular import *
# --- Load the raw table -------------------------------------------------
path = Path('Data/')
re_df = pd.read_csv(path/'re_final.csv')
re_df.head()
n = len(re_df); n

# Remove the two columns that are not used below and store the FIPS code
# as a string.
re_df = re_df.drop(columns=['date', 'County_Name'])
re_df['fips'] = re_df['fips'].astype(str)

# --- Build the modelling table for 'spriceadj_resp' ---------------------
# Keep rows where the response is present, drop six other *_resp columns,
# and renumber the rows from 0 so index labels equal row positions.
re_asp_df = (
    re_df
    .dropna(subset=['spriceadj_resp'])
    .drop(columns=['preduc_resp', 'zhvi_resp', 'doz_resp', 'mlist_resp',
                   'salecounts_resp', 'forpct_resp'])
    .reset_index(drop=True)
)
n_asp = len(re_asp_df); n_asp
re_asp_df['year'].dtypes
re_asp_df['year'].max()

# --- Train / validation split -------------------------------------------
# Year 2017 is held out for validation; all remaining rows are training.
re_asp_df_train = re_asp_df.loc[re_asp_df['year'] != 2017]
len(re_asp_df_train)
re_asp_df_valid = re_asp_df.loc[re_asp_df['year'] == 2017]
len(re_asp_df_valid)

# Row labels of the validation rows, later handed to split_by_idx.
valid_idx = re_asp_df_valid.index
valid_idx
# Response column, wrapped in a list so it can be concatenated with the
# feature-name lists below.
dep_var = ['spriceadj_resp']
re_asp_df.columns
len(re_asp_df.columns)

# Feature columns are chosen by position: columns 0-2 and 71-73 are
# treated as categorical, columns 3-70 as continuous.
cat_vars = [*re_asp_df.columns[:3], *re_asp_df.columns[71:74]]
cont_vars = re_asp_df.columns[3:71].tolist()

# Working copy restricted to the selected features plus the response.
df = re_asp_df.loc[:, cat_vars + cont_vars + dep_var].copy()

procs = [FillMissing, Categorify, Normalize]
np.random.seed(4537)

# Assemble the DataBunch: split on the held-out row labels and use the
# response as a float label with log=True.
data = (
    TabularList.from_df(
        df,
        path=path,
        cat_names=cat_vars,
        cont_names=cont_vars,
        procs=procs,
    )
    .split_by_idx(valid_idx)
    .label_from_df(cols=dep_var, label_cls=FloatList, log=True)
    .databunch()
)
# Upper end of the prediction range on the log scale: the log of 1.2 times
# the largest response value seen in the training rows.
max_log_y = np.log(1.2 * re_asp_df_train['spriceadj_resp'].max())
y_range = torch.tensor([0, max_log_y], device=defaults.device)

# Two hidden layers (1000 and 500 units) with per-layer dropout and a
# separate dropout on the embeddings; outputs are limited to y_range.
learn = tabular_learner(
    data,
    layers=[1000, 500],
    ps=[0.001, 0.01],
    emb_drop=0.04,
    y_range=y_range,
    metrics=exp_rmspe,
)
len(data.train_ds.cont_names)

# Learning-rate range test and its plot.
learn.lr_find()
learn.recorder.plot()

# Train for 5 epochs with the one-cycle policy, then save the model
# under the name 'rep1'.
learn.fit_one_cycle(5, 1e-2, wd=0.2)
learn.save('rep1')
def emb_pca(num_embed, cat_name, df = df):
    """Reduce one learned categorical embedding to two principal components.

    The embedding matrix at position ``num_embed`` of ``learn.model.embeds``
    is labelled with the levels of ``df[cat_name]``, joined back onto every
    row of ``df``, and PCA with two components is fitted on the resulting
    row-level embedding vectors (so frequent levels weigh more in the fit).

    Parameters
    ----------
    num_embed : int
        Position of the embedding layer in ``learn.model.embeds``; it must
        be the layer that belongs to ``cat_name``.
    cat_name : str
        Name of the categorical column in ``df``.
    df : pandas.DataFrame
        Frame containing ``cat_name``. Defaults to the module-level ``df``
        as it existed when this function was defined.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` with columns ``emb_pc1``, ``emb_pc2`` and
        ``cat_name``, on a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``df[cat_name]`` contains missing values, or if its number of
        distinct levels differs from the number of embedding rows.
    """
    keys = df[[cat_name]].reset_index(drop=True)
    if keys[cat_name].isna().any():
        raise ValueError(
            f"column {cat_name!r} contains missing values; they have no "
            "labelled embedding row to join against"
        )

    # The first embedding row is skipped, as in the original code; fastai v1
    # reserves index 0 for its '#na#' placeholder level.
    weights = learn.model.embeds[num_embed].weight[1:].cpu().detach().numpy()

    # fastai's Categorify numbers levels in pandas-categorical order (sorted
    # for sortable values), NOT in order of first appearance, so the labels
    # must come from the categorical's categories rather than .unique().
    levels = keys[cat_name].astype('category').cat.categories
    if len(levels) != weights.shape[0]:
        # NOTE(review): Categorify presumably learns its levels from the
        # training split only, so a level that occurs solely in validation
        # rows would trigger this mismatch - confirm against the data.
        raise ValueError(
            f"column {cat_name!r} has {len(levels)} levels but embedding "
            f"{num_embed} has {weights.shape[0]} rows"
        )

    lookup = pd.DataFrame(weights).add_prefix('emb_')
    lookup[cat_name] = levels.to_numpy()

    # Join on the key column alone so unrelated columns of df whose names
    # happen to start with 'emb_' cannot leak into the PCA input.
    per_row = pd.merge(keys, lookup, how='left', on=cat_name)
    embed_df = per_row.filter(regex='^emb_', axis=1)

    pca = PCA(n_components=2)
    components = pca.fit_transform(embed_df)
    pc_embed_df = pd.DataFrame(data=components, columns=['emb_pc1', 'emb_pc2'])

    # keys carries a fresh RangeIndex, so this column-wise concat lines up
    # row by row even when df itself has a non-default index.
    return pd.concat([pc_embed_df, keys], axis=1)
# Export the 2-D PCA projection of four categorical embeddings, one CSV per
# variable. Each entry is (embedding position, column name, output file);
# the order matches the original sequence of calls, so emb_data ends up
# holding the 'month' result.
for num_embed, cat_name, out_name in (
    (5, 'division', 'division_embed.csv'),
    (4, 'region', 'region_embed.csv'),
    (3, 'state', 'state_embed.csv'),
    (2, 'month', 'month_embed.csv'),
):
    emb_data = emb_pca(num_embed=num_embed, cat_name=cat_name)
    emb_data.to_csv(path/out_name, index=False)