[kaggle] Learn Tutorial_Intro to Machine Learning (Summary)_3 (Practice Code)

Ykl 2022. 11. 5. 18:26
kaggle_learn_ml
In [1]:
ls
sample_data/  train.csv

A quick look at the data

In [26]:
import pandas as pd
iowa_file_path = 'train.csv'
home_data = pd.read_csv(iowa_file_path)
home_data.describe()
Out[26]:
(Transposed and rounded for readability; the '...' row stands for the columns truncated in the notebook output.)

               count       mean        std       min        25%        50%        75%        max
Id            1460.0     730.50     421.61      1.00     365.75     730.50    1095.25    1460.00
MSSubClass    1460.0      56.90      42.30     20.00      20.00      50.00      70.00     190.00
LotFrontage   1201.0      70.05      24.28     21.00      59.00      69.00      80.00     313.00
LotArea       1460.0   10516.83    9981.26   1300.00    7553.50    9478.50   11601.50  215245.00
OverallQual   1460.0       6.10       1.38      1.00       5.00       6.00       7.00      10.00
OverallCond   1460.0       5.58       1.11      1.00       5.00       5.00       6.00       9.00
YearBuilt     1460.0    1971.27      30.20   1872.00    1954.00    1973.00    2000.00    2010.00
YearRemodAdd  1460.0    1984.87      20.65   1950.00    1967.00    1994.00    2004.00    2010.00
MasVnrArea    1452.0     103.69     181.07      0.00       0.00       0.00     166.00    1600.00
BsmtFinSF1    1460.0     443.64     456.10      0.00       0.00     383.50     712.25    5644.00
...
WoodDeckSF    1460.0      94.24     125.34      0.00       0.00       0.00     168.00     857.00
OpenPorchSF   1460.0      46.66      66.26      0.00       0.00      25.00      68.00     547.00
EnclosedPorch 1460.0      21.95      61.12      0.00       0.00       0.00       0.00     552.00
3SsnPorch     1460.0       3.41      29.32      0.00       0.00       0.00       0.00     508.00
ScreenPorch   1460.0      15.06      55.76      0.00       0.00       0.00       0.00     480.00
PoolArea      1460.0       2.76      40.18      0.00       0.00       0.00       0.00     738.00
MiscVal       1460.0      43.49     496.12      0.00       0.00       0.00       0.00   15500.00
MoSold        1460.0       6.32       2.70      1.00       5.00       6.00       8.00      12.00
YrSold        1460.0    2007.82       1.33   2006.00    2007.00    2008.00    2009.00    2010.00
SalePrice     1460.0  180921.20   79442.50  34900.00  129975.00  163000.00  214000.00  755000.00

8 rows × 38 columns

In [27]:
# average lot size, rounded to the nearest integer
avg_lot_size = round(home_data['LotArea'].mean())
print(f'avg_lot_size : {avg_lot_size}')

# age (as of 2022) of the most recently built home
newest_home_age = 2022 - round(home_data['YearBuilt'].max())
print(f'newest_home_age: {newest_home_age}')
avg_lot_size : 10517
newest_home_age: 12
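
To avoid hard-coding the year 2022, the current year could be read from the standard library instead; a minimal sketch (not part of the original notebook):

from datetime import date

# hypothetical variant: derive the current year instead of hard-coding 2022
newest_home_age = date.today().year - home_data['YearBuilt'].max()
print(f'newest_home_age: {newest_home_age}')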


Step 1: Specify Prediction Target
Step 2: Create X

In [28]:
home_data.columns
Out[28]:
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType',
       'SaleCondition', 'SalePrice'],
      dtype='object')

👆 The prediction target is 'SalePrice'.

In [63]:
# home_data = home_data.dropna(axis=0)
# no need to run this on this data: the feature columns used below have no missing values
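
That claim can be verified by counting missing values in the columns used as features below (the column list is repeated here just for the check):

# sanity check: count NaNs in the columns that will be used as features
check_cols = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF',
              'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
print(home_data[check_cols].isnull().sum())  # all zeros for this data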
In [31]:
y = home_data.SalePrice

home_features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[home_features]
print(X.describe())
print(X.head())
             LotArea    YearBuilt     1stFlrSF     2ndFlrSF     FullBath  \
count    1460.000000  1460.000000  1460.000000  1460.000000  1460.000000   
mean    10516.828082  1971.267808  1162.626712   346.992466     1.565068   
std      9981.264932    30.202904   386.587738   436.528436     0.550916   
min      1300.000000  1872.000000   334.000000     0.000000     0.000000   
25%      7553.500000  1954.000000   882.000000     0.000000     1.000000   
50%      9478.500000  1973.000000  1087.000000     0.000000     2.000000   
75%     11601.500000  2000.000000  1391.250000   728.000000     2.000000   
max    215245.000000  2010.000000  4692.000000  2065.000000     3.000000   

       BedroomAbvGr  TotRmsAbvGrd  
count   1460.000000   1460.000000  
mean       2.866438      6.517808  
std        0.815778      1.625393  
min        0.000000      2.000000  
25%        2.000000      5.000000  
50%        3.000000      6.000000  
75%        3.000000      7.000000  
max        8.000000     14.000000  
   LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  \
0     8450       2003       856       854         2             3   
1     9600       1976      1262         0         2             3   
2    11250       2001       920       866         2             3   
3     9550       1915       961       756         1             3   
4    14260       2000      1145      1053         2             4   

   TotRmsAbvGrd  
0             8  
1             6  
2             6  
3             7  
4             9  

Step 3: Specify and Fit Model

In [37]:
from sklearn.tree import DecisionTreeRegressor

# specify the model; random_state makes tree building reproducible
home_model = DecisionTreeRegressor(random_state=1)
home_model.fit(X, y)  # the model has now learned patterns in the data

# check the predictions against the first 5 rows of the data
print(f'Making predictions for the following 5 houses: {X.head()}')
print(f'The real answers are\n {y.head()}')
print(f'The predictions are\n {home_model.predict(X.head())}')
# the predictions match exactly because these rows were in the training data;
# an unconstrained tree can memorize them, so this is not evidence of accuracy
Making predictions for the following 5 houses:    LotArea  YearBuilt  1stFlrSF  2ndFlrSF  FullBath  BedroomAbvGr  \
0     8450       2003       856       854         2             3   
1     9600       1976      1262         0         2             3   
2    11250       2001       920       866         2             3   
3     9550       1915       961       756         1             3   
4    14260       2000      1145      1053         2             4   

   TotRmsAbvGrd  
0             8  
1             6  
2             6  
3             7  
4             9  
The real answers are
 0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64
The predictions are
 [208500. 181500. 223500. 140000. 250000.]

Step 4: Make Predictions

In [38]:
prediction = home_model.predict(X)
print(y[:10])
print(prediction[:10])
0    208500
1    181500
2    223500
3    140000
4    250000
5    143000
6    307000
7    200000
8    129900
9    118000
Name: SalePrice, dtype: int64
[208500. 181500. 223500. 140000. 250000. 143000. 307000. 200000. 129900.
 118000.]

Model Validation
This covers the steps of validating the model by splitting the data into two groups: a training set and a validation set.

Step 1: Split Your Data
Step 2: Specify and Fit the Model
Step 3: Make Predictions with Validation data

In [40]:
from sklearn.model_selection import train_test_split

# split features and target into training and validation sets
# (75% / 25% by default); random_state makes the split reproducible
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# refit the model on the training data only
home_model.fit(train_X, train_y)

# predict on data the model has never seen
val_predictions = home_model.predict(val_X)

print(f'val_prediction : \n{val_predictions[:10]}')
print(f'val_y : \n{val_y[:10]}')
val_prediction : 
[186500. 184000. 130000.  92000. 164500. 220000. 335000. 144152. 215000.
 262000.]
val_y : 
258     231500
267     179500
288     122000
649      84500
1233    142000
167     325624
926     285000
831     151000
1237    195000
426     275000
Name: SalePrice, dtype: int64
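
Unlike the earlier in-sample check, these validation predictions no longer match the actual prices, so we need a metric that quantifies how far off they are.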

Step 4: Calculate the Mean Absolute Error in Validation Data

In [42]:
from sklearn.metrics import mean_absolute_error

# mean_absolute_error(y_true, y_pred): the average of |actual - predicted|
val_mae = mean_absolute_error(val_y, val_predictions)
print(f'val_mae: {val_mae}')
val_mae: 29652.931506849316
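
As a sanity check, the same number can be computed directly from the definition (the mean of the absolute errors); a minimal sketch:

import numpy as np

# MAE by hand: average absolute difference between actual and predicted prices
manual_mae = np.mean(np.abs(val_y.to_numpy() - val_predictions))
print(manual_mae)  # matches val_mae above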

Underfitting and Overfitting
Write a helper function to diagnose underfitting and overfitting: train several tree models with different maximum numbers of leaf nodes, then compare each model's MAE to find the optimally sized tree.

In [43]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
  # train a tree capped at max_leaf_nodes and return its validation MAE
  model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
  model.fit(train_X, train_y)
  val_preds = model.predict(val_X)
  return mean_absolute_error(val_y, val_preds)
In [51]:
max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# map each candidate tree size to its validation MAE
scores = {leaf_size : get_mae(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in max_leaf_nodes}
print(scores)

# the candidate whose model had the smallest validation MAE
best_tree_size = min(scores, key=scores.get)
print(f'best_tree_size : {best_tree_size}')
{5: 35044.51299744237, 25: 29016.41319191076, 50: 27405.930473214907, 100: 27282.50803885739, 250: 27430.850744944964, 500: 28380.917944156296}
best_tree_size : 100
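
The scores are easier to read as a curve of validation MAE against tree size; a minimal matplotlib sketch (matplotlib is an assumption here, not part of the original notebook):

import matplotlib.pyplot as plt

# hypothetical visualization: MAE rises on the left (underfitting, too few leaves)
# and on the right (overfitting, too many leaves)
plt.plot(list(scores.keys()), list(scores.values()), marker='o')
plt.xscale('log')
plt.xlabel('max_leaf_nodes')
plt.ylabel('validation MAE')
plt.show()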


Step 2: Fit Model Using All Data
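Once the best tree size is known, the model is refit on all of the data (X and y, not just the training split) so it can learn from every available observation before being put to use.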

In [52]:
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)
final_model.fit(X, y)
Out[52]:
DecisionTreeRegressor(max_leaf_nodes=100, random_state=1)


Random Forests

In [55]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(random_state=1)
rf_model.fit(train_X, train_y)

rf_val_preds = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(rf_val_preds, val_y)

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))

print(scores[best_tree_size])
Validation MAE for Random Forest Model: 21857.15912981083
27282.50803885739
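
As with the single tree, the random forest could be refit on all of the data once validated; a minimal sketch (rf_model_full is a hypothetical name, not in the original notebook):

# refit on all available data before making final predictions
rf_model_full = RandomForestRegressor(random_state=1)
rf_model_full.fit(X, y)
print(rf_model_full.predict(X.head()))  # e.g. predictions for the first 5 houses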
In [57]:
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
In [62]:
!jupyter nbconvert --to html "/content/drive/MyDrive/Kaggle/project0/kaggle_learn_ml.ipynb"
[NbConvertApp] Converting notebook /content/drive/MyDrive/Kaggle/project0/kaggle_learn_ml.ipynb to html
[NbConvertApp] Writing 318585 bytes to /content/drive/MyDrive/Kaggle/project0/kaggle_learn_ml.html