import sys, os
sys.path.append(os.path.abspath(".."))
from src.utils import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoderSplitting Input and output
df_train = load_data("../data/raw/train.csv")Data Shape: (188533, 12)
df_test = load_data("../data/raw/test.csv")Data Shape: (125690, 11)
X_train, y_train = split_input_output(df_train, 'price')Original data shape: (188533, 12)
X data shape: (188533, 11)
y datashape: (188533,)
X_train.info()<class 'pandas.core.frame.DataFrame'>
Index: 188533 entries, 0 to 188532
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 brand 188533 non-null object
1 model 188533 non-null object
2 model_year 188533 non-null int64
3 milage 188533 non-null int64
4 fuel_type 183450 non-null object
5 engine 188533 non-null object
6 transmission 188533 non-null object
7 ext_col 188533 non-null object
8 int_col 188533 non-null object
9 accident 186081 non-null object
10 clean_title 167114 non-null object
dtypes: int64(2), object(9)
memory usage: 17.3+ MB
Handling Inconsistent format: fuel_type and transmission
def format_cleaning(X):
X = X.copy()
#Handling Inconsistencies in 'fuel_type'
X['fuel_type'] = X['fuel_type'].replace({'-': 'Unknown','not supported': 'Other', np.nan: 'Unknown'})
#Handling Inconsistencies in 'Transmission'
trans_series = X['transmission'].astype(str).str.lower().str.strip()
X['transmission'] = "Unknown"
X.loc[trans_series.str.contains(r'(?:a\/t|at|automatic|auto[-\s]?shift|cvt|dct|\d+\s*[-]?\s*speed\s*(?:a\/t|at|automatic))', regex=True, na=False), 'transmission'] = "a/t"
X.loc[trans_series.str.contains(r'(?:m\/t|mt|manual|\d+\s*[-]?\s*speed\s*(?:m\/t|mt|manual))', regex=True, na=False), 'transmission'] = "m/t"
return XX_train_format_cleaning = format_cleaning(X_train)X_train_format_cleaning.head()| brand | model | model_year | milage | fuel_type | engine | transmission | ext_col | int_col | accident | clean_title | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| id | |||||||||||
| 0 | MINI | Cooper S Base | 2007 | 213000 | Gasoline | 172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel | a/t | Yellow | Gray | None reported | Yes |
| 1 | Lincoln | LS V8 | 2002 | 143250 | Gasoline | 252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel | a/t | Silver | Beige | At least 1 accident or damage reported | Yes |
| 2 | Chevrolet | Silverado 2500 LT | 2002 | 136731 | E85 Flex Fuel | 320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab... | a/t | Blue | Gray | None reported | Yes |
| 3 | Genesis | G90 5.0 Ultimate | 2017 | 19500 | Gasoline | 420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel | Unknown | Black | Black | None reported | Yes |
| 4 | Mercedes-Benz | Metris Base | 2021 | 7388 | Gasoline | 208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel | a/t | Black | Beige | None reported | Yes |
Extracting Information from engine
def extract_fuel(val):
s = str(val).lower()
if "hydrogen"in s:
return "Hydrogen"
elif "electric" in s:
return "Electric"
elif "plug-in hybrid" in s:
return "Plug-In Hybrid"
elif "hybrid" in s:
return "Hybrid"
elif "diesel" in s:
return "Diesel"
elif "gasoline" in s:
return "Gasoline"
elif "flexible fuel" in s or "flex fuel" in s:
return "Flexible Fuel"
else:
return "Unknown"
def extract_horsepower(val):
if not isinstance(val, str):
return np.nan
match = re.search(r'([\d.]+)\s*HP', val, re.IGNORECASE)
return float(match.group(1)) if match else np.nan
def extract_engine_size(val):
if not isinstance(val, str):
return np.nan
match = re.search(r'([\d.]+)\s*L', val, re.IGNORECASE)
return float(match.group(1)) if match else np.nan
def extract_cylinder(val):
if not isinstance(val, str):
return np.nan
match = re.search(r'\bV?(\d+)\s*(Cylinder|V\d|I\d|Rotary)', val, re.IGNORECASE)
return int(match.group(1)) if match else np.nan
def extract_is_electric(val):
if not isinstance(val, str):
return 0
return 1 if re.search(r'electric\s+motor', val, re.IGNORECASE) else 0
def extract_is_turbo(val):
if not isinstance(val, str):
return 0
return 1 if re.search(r'turbo|supercharged', val, re.IGNORECASE) else 0
def extract_fuel_system(val):
if not isinstance(val, str):
return np.nan
match = re.search(r'([A-Za-z\s]+Fuel\s*System|[A-Za-z\s]+Fuel)$', val.strip(), re.IGNORECASE)
return match.group(1).strip() if match else "Unknown"def extract_info(X, engine_col="engine"):
X = X.copy()
specs_df = pd.DataFrame({
"horsepower": X[engine_col].apply(extract_horsepower),
"engine_size": X[engine_col].apply(extract_engine_size),
"cylinder": X[engine_col].apply(extract_cylinder),
"is_electric": X[engine_col].apply(extract_is_electric),
"is_turbo": X[engine_col].apply(extract_is_turbo),
"fuel_system": X[engine_col].apply(extract_fuel_system),
})
fuel = X['engine'].apply(extract_fuel)
X['fuel_type'] = X['fuel_type'].mask(X['fuel_type'] == "Unknown", fuel)
# Set 0 for EV if hp/cylinder/size is NaN
mask_ev = specs_df["is_electric"] == 1
specs_df.loc[mask_ev & specs_df["horsepower"].isna(), "horsepower"] = 0
specs_df.loc[mask_ev & specs_df["cylinder"].isna(), "cylinder"] = 0
specs_df.loc[mask_ev & specs_df["engine_size"].isna(), "engine_size"] = 0
X.drop(['engine','model'], axis = 1,inplace=True)
return pd.concat([X, specs_df], axis=1)X_train_extracted = extract_info(X_train_format_cleaning)Handling Missing Values: accident and clean_title
X_train_extracted.isna().sum()brand 0
model_year 0
milage 0
fuel_type 0
transmission 0
ext_col 0
int_col 0
accident 2452
clean_title 21419
horsepower 32975
engine_size 1787
cylinder 32875
is_electric 0
is_turbo 0
fuel_system 0
dtype: int64
#impute horsepower, engine_size, cylinder columns for non-electric cars
def imputer_fit(df):
df_non_ev = df[df['is_electric']==0]
cols =["horsepower", "engine_size", "cylinder"]
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp.fit(df_non_ev[cols])
return imp
def imputer_transform(data ,imp):
data = data.copy()
# Non-electric impute with median
cols=["horsepower", "engine_size", "cylinder"]
mask_non_ev = data['is_electric'] == 0
data.loc[mask_non_ev, cols] = imp.transform(data.loc[mask_non_ev, cols])
# Impute accidend and clean_title
data['accident'] = data['accident'].fillna('Unknown')
data['clean_title'] = data['clean_title'].fillna('Unknown')
return dataimp = imputer_fit(X_train_extracted)
X_train_imputed = imputer_transform(X_train_extracted, imp)serialize_data(imp, path='../models/imputer.pkl')X_train_imputed.isna().sum()brand 0
model_year 0
milage 0
fuel_type 0
transmission 0
ext_col 0
int_col 0
accident 0
clean_title 0
horsepower 0
engine_size 0
cylinder 0
is_electric 0
is_turbo 0
fuel_system 0
dtype: int64
Simplifying Color
def simplify_color(color):
"""
Menyederhanakan kategori warna menjadi:
blue, red, black, silver, white, gold, orange, purple, beige, other, unknown
"""
if not isinstance(color, str) or color.strip() == "" or color.strip() =="-":
return "Unknown"
color_lower = color.lower()
mapping = {
"blue": ["blue", "navy", "aqua", "turquoise", "teal"],
"red": ["red", "maroon", "burgundy"],
"black": ["black", "ebony", "onyx"],
"silver": ["silver", "gray", "grey", "graphite", "charcoal"],
"white": ["white", "ivory", "cream", "pearl"],
"gold": ["gold", "champagne"],
"orange": ["orange", "copper", "bronze"],
"purple": ["purple", "violet", "plum", "lavender"],
"beige": ["beige", "tan", "sand", "khaki", "camel"]
}
for base_color, keywords in mapping.items():
if any(kw in color_lower for kw in keywords):
return base_color
return "Other"
def color_transform(X):
X["ext_col"] = X["ext_col"].apply(simplify_color)
X["int_col"] = X["int_col"].apply(simplify_color)
return XX_train_color_simplified = color_transform(X_train_imputed)X_train_color_simplified['int_col'].nunique()9
X_train_color_simplified['ext_col'].nunique()10
Transformation: Standard Scaling
X_train_color_simplified.info()<class 'pandas.core.frame.DataFrame'>
Index: 188533 entries, 0 to 188532
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 brand 188533 non-null object
1 model_year 188533 non-null int64
2 milage 188533 non-null int64
3 fuel_type 188533 non-null object
4 transmission 188533 non-null object
5 ext_col 188533 non-null object
6 int_col 188533 non-null object
7 accident 188533 non-null object
8 clean_title 188533 non-null object
9 horsepower 188533 non-null float64
10 engine_size 188533 non-null float64
11 cylinder 188533 non-null float64
12 is_electric 188533 non-null int64
13 is_turbo 188533 non-null int64
14 fuel_system 188533 non-null object
dtypes: float64(3), int64(4), object(8)
memory usage: 23.0+ MB
def scaler_fit(df):
cols = ['model_year','milage','horsepower','engine_size','cylinder']
df[cols] = df[cols].astype(float)
scaler = StandardScaler()
scaler.fit(df[cols])
return scaler
def scaler_transform(df, scaler):
cols = ['model_year','milage','horsepower','engine_size','cylinder']
df[cols] = df[cols].astype(float)
df.loc[:, cols] = scaler.transform(df[cols]).astype(float)
return dfscaler = scaler_fit(X_train_color_simplified)
X_train_scaled = scaler_transform(X_train_color_simplified, scaler)serialize_data(scaler, path='../models/scaler.pkl')Encoding Categorical Variables
X_train_scaled.info()<class 'pandas.core.frame.DataFrame'>
Index: 188533 entries, 0 to 188532
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 brand 188533 non-null object
1 model_year 188533 non-null float64
2 milage 188533 non-null float64
3 fuel_type 188533 non-null object
4 transmission 188533 non-null object
5 ext_col 188533 non-null object
6 int_col 188533 non-null object
7 accident 188533 non-null object
8 clean_title 188533 non-null object
9 horsepower 188533 non-null float64
10 engine_size 188533 non-null float64
11 cylinder 188533 non-null float64
12 is_electric 188533 non-null int64
13 is_turbo 188533 non-null int64
14 fuel_system 188533 non-null object
dtypes: float64(5), int64(2), object(8)
memory usage: 23.0+ MB
cat_cols = X_train_scaled.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X_train_scaled.select_dtypes(include=["float","int"]).columns.tolist()
display(cat_cols)
display(num_cols)['brand',
'fuel_type',
'transmission',
'ext_col',
'int_col',
'accident',
'clean_title',
'fuel_system']
['model_year',
'milage',
'horsepower',
'engine_size',
'cylinder',
'is_electric',
'is_turbo']
One-Hot Encoder
def OHencoder_fit(X):
cat_cols = ['brand','fuel_type','transmission','ext_col','int_col','accident', 'clean_title','fuel_system']
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore',).set_output(transform="pandas")
ohe.fit(X[cat_cols])
return ohe
def encoder_transform(X, encoder):
cat_cols = ['brand','fuel_type','transmission','ext_col','int_col','accident', 'clean_title','fuel_system']
encoded = encoder.transform(X[cat_cols])
df_encoded = pd.concat([X.drop(columns=cat_cols), encoded], axis=1)
return df_encodedOHencoder = OHencoder_fit(X_train_scaled)serialize_data(OHencoder, path='../models/OHencoder.pkl')Preprocess Pipeline
def preprocessing_pipeline(X, imputer, scaler, encoder):
X_clean = X.copy()
X_clean = format_cleaning(X_clean)
X_clean = extract_info(X_clean)
X_clean = imputer_transform(X_clean, imputer)
X_clean = color_transform(X_clean)
X_clean = scaler_transform(X_clean, scaler)
X_clean = encoder_transform(X_clean, encoder)
return X_cleanX_train_clean = preprocessing_pipeline(X_train, imp, scaler, OHencoder)