Splitting Input and output

import sys, os
sys.path.append(os.path.abspath(".."))
from src.utils import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

df_train = load_data("../data/raw/train.csv")

Data Shape:  (188533, 12)

df_test = load_data("../data/raw/test.csv")

Data Shape:  (125690, 11)

X_train, y_train = split_input_output(df_train, 'price')

Original data shape: (188533, 12)
X data shape: (188533, 11)
y datashape: (188533,)

X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 188533 entries, 0 to 188532
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   brand         188533 non-null  object
 1   model         188533 non-null  object
 2   model_year    188533 non-null  int64 
 3   milage        188533 non-null  int64 
 4   fuel_type     183450 non-null  object
 5   engine        188533 non-null  object
 6   transmission  188533 non-null  object
 7   ext_col       188533 non-null  object
 8   int_col       188533 non-null  object
 9   accident      186081 non-null  object
 10  clean_title   167114 non-null  object
dtypes: int64(2), object(9)
memory usage: 17.3+ MB

Handling Inconsistent format: fuel_type and transmission

def format_cleaning(X):
  X = X.copy()

  #Handling Inconsistencies in 'fuel_type'
  X['fuel_type'] = X['fuel_type'].replace({'-': 'Unknown','not supported': 'Other', np.nan: 'Unknown'})

  #Handling Inconsistencies in 'Transmission'
  trans_series = X['transmission'].astype(str).str.lower().str.strip()
  X['transmission'] = "Unknown"
  X.loc[trans_series.str.contains(r'(?:a\/t|at|automatic|auto[-\s]?shift|cvt|dct|\d+\s*[-]?\s*speed\s*(?:a\/t|at|automatic))', regex=True, na=False), 'transmission'] = "a/t"
  X.loc[trans_series.str.contains(r'(?:m\/t|mt|manual|\d+\s*[-]?\s*speed\s*(?:m\/t|mt|manual))', regex=True, na=False), 'transmission'] = "m/t"
  return X

X_train_format_cleaning = format_cleaning(X_train)

X_train_format_cleaning.head()

	brand	model	model_year	milage	fuel_type	engine	transmission	ext_col	int_col	accident	clean_title
id
0	MINI	Cooper S Base	2007	213000	Gasoline	172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel	a/t	Yellow	Gray	None reported	Yes
1	Lincoln	LS V8	2002	143250	Gasoline	252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel	a/t	Silver	Beige	At least 1 accident or damage reported	Yes
2	Chevrolet	Silverado 2500 LT	2002	136731	E85 Flex Fuel	320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...	a/t	Blue	Gray	None reported	Yes
3	Genesis	G90 5.0 Ultimate	2017	19500	Gasoline	420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel	Unknown	Black	Black	None reported	Yes
4	Mercedes-Benz	Metris Base	2021	7388	Gasoline	208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel	a/t	Black	Beige	None reported	Yes

Extracting Information from `engine`

def extract_fuel(val):
    s = str(val).lower()

    if "hydrogen"in s:
        return "Hydrogen"
    elif "electric" in s:
        return "Electric"
    elif "plug-in hybrid" in s:
        return "Plug-In Hybrid"
    elif "hybrid" in s:
        return "Hybrid"
    elif "diesel" in s:
        return "Diesel"
    elif "gasoline" in s:
        return "Gasoline"
    elif "flexible fuel" in s or "flex fuel" in s:
        return "Flexible Fuel"
    else:
        return "Unknown"

def extract_horsepower(val):
    if not isinstance(val, str):
        return np.nan
    match = re.search(r'([\d.]+)\s*HP', val, re.IGNORECASE)
    return float(match.group(1)) if match else np.nan

def extract_engine_size(val):
    if not isinstance(val, str):
        return np.nan
    match = re.search(r'([\d.]+)\s*L', val, re.IGNORECASE)
    return float(match.group(1)) if match else np.nan

def extract_cylinder(val):
    if not isinstance(val, str):
        return np.nan
    match = re.search(r'\bV?(\d+)\s*(Cylinder|V\d|I\d|Rotary)', val, re.IGNORECASE)
    return int(match.group(1)) if match else np.nan

def extract_is_electric(val):
    if not isinstance(val, str):
        return 0
    return 1 if re.search(r'electric\s+motor', val, re.IGNORECASE) else 0

def extract_is_turbo(val):
    if not isinstance(val, str):
        return 0
    return 1 if re.search(r'turbo|supercharged', val, re.IGNORECASE) else 0

def extract_fuel_system(val):
    if not isinstance(val, str):
        return np.nan
    match = re.search(r'([A-Za-z\s]+Fuel\s*System|[A-Za-z\s]+Fuel)$', val.strip(), re.IGNORECASE)
    return match.group(1).strip() if match else "Unknown"

def extract_info(X, engine_col="engine"):
  X = X.copy()
  specs_df = pd.DataFrame({
        "horsepower": X[engine_col].apply(extract_horsepower),
        "engine_size": X[engine_col].apply(extract_engine_size),
        "cylinder": X[engine_col].apply(extract_cylinder),
        "is_electric": X[engine_col].apply(extract_is_electric),
        "is_turbo": X[engine_col].apply(extract_is_turbo),
        "fuel_system": X[engine_col].apply(extract_fuel_system),
    })

  fuel = X['engine'].apply(extract_fuel)
  X['fuel_type'] = X['fuel_type'].mask(X['fuel_type'] == "Unknown", fuel)

    # Set 0 for EV if hp/cylinder/size is NaN
  mask_ev = specs_df["is_electric"] == 1
  specs_df.loc[mask_ev & specs_df["horsepower"].isna(), "horsepower"] = 0
  specs_df.loc[mask_ev & specs_df["cylinder"].isna(), "cylinder"] = 0
  specs_df.loc[mask_ev & specs_df["engine_size"].isna(), "engine_size"] = 0

  X.drop(['engine','model'], axis = 1,inplace=True)

  return pd.concat([X, specs_df], axis=1)

X_train_extracted = extract_info(X_train_format_cleaning)

Handling Missing Values: `accident` and `clean_title`

X_train_extracted.isna().sum()

brand               0
model_year          0
milage              0
fuel_type           0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
horsepower      32975
engine_size      1787
cylinder        32875
is_electric         0
is_turbo            0
fuel_system         0
dtype: int64

#impute horsepower, engine_size, cylinder columns for non-electric cars
def imputer_fit(df):
  df_non_ev = df[df['is_electric']==0]
  cols =["horsepower", "engine_size", "cylinder"]
  imp = SimpleImputer(missing_values=np.nan, strategy='median')
  imp.fit(df_non_ev[cols])
  return imp

def imputer_transform(data ,imp):
  data = data.copy()
  # Non-electric impute with median
  cols=["horsepower", "engine_size", "cylinder"]
  mask_non_ev = data['is_electric'] == 0
  data.loc[mask_non_ev, cols] = imp.transform(data.loc[mask_non_ev, cols])

  # Impute accidend and clean_title
  data['accident'] = data['accident'].fillna('Unknown')
  data['clean_title'] = data['clean_title'].fillna('Unknown')

  return data

imp = imputer_fit(X_train_extracted)

X_train_imputed = imputer_transform(X_train_extracted, imp)

serialize_data(imp, path='../models/imputer.pkl')

X_train_imputed.isna().sum()

brand           0
model_year      0
milage          0
fuel_type       0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
horsepower      0
engine_size     0
cylinder        0
is_electric     0
is_turbo        0
fuel_system     0
dtype: int64

Simplifying Color

def simplify_color(color):
    """
    Menyederhanakan kategori warna menjadi:
    blue, red, black, silver, white, gold, orange, purple, beige, other, unknown
    """
    if not isinstance(color, str) or color.strip() == "" or color.strip() =="-":
        return "Unknown"

    color_lower = color.lower()

    mapping = {
        "blue": ["blue", "navy", "aqua", "turquoise", "teal"],
        "red": ["red", "maroon", "burgundy"],
        "black": ["black", "ebony", "onyx"],
        "silver": ["silver", "gray", "grey", "graphite", "charcoal"],
        "white": ["white", "ivory", "cream", "pearl"],
        "gold": ["gold", "champagne"],
        "orange": ["orange", "copper", "bronze"],
        "purple": ["purple", "violet", "plum", "lavender"],
        "beige": ["beige", "tan", "sand", "khaki", "camel"]
    }

    for base_color, keywords in mapping.items():
        if any(kw in color_lower for kw in keywords):
            return base_color

    return "Other"

def color_transform(X):
  X["ext_col"] = X["ext_col"].apply(simplify_color)
  X["int_col"] = X["int_col"].apply(simplify_color)
  return X

X_train_color_simplified = color_transform(X_train_imputed)

X_train_color_simplified['int_col'].nunique()

X_train_color_simplified['ext_col'].nunique()

Transformation: Standard Scaling

X_train_color_simplified.info()

<class 'pandas.core.frame.DataFrame'>
Index: 188533 entries, 0 to 188532
Data columns (total 15 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   brand         188533 non-null  object 
 1   model_year    188533 non-null  int64  
 2   milage        188533 non-null  int64  
 3   fuel_type     188533 non-null  object 
 4   transmission  188533 non-null  object 
 5   ext_col       188533 non-null  object 
 6   int_col       188533 non-null  object 
 7   accident      188533 non-null  object 
 8   clean_title   188533 non-null  object 
 9   horsepower    188533 non-null  float64
 10  engine_size   188533 non-null  float64
 11  cylinder      188533 non-null  float64
 12  is_electric   188533 non-null  int64  
 13  is_turbo      188533 non-null  int64  
 14  fuel_system   188533 non-null  object 
dtypes: float64(3), int64(4), object(8)
memory usage: 23.0+ MB

def scaler_fit(df):
  cols = ['model_year','milage','horsepower','engine_size','cylinder']
  df[cols] = df[cols].astype(float)
  scaler = StandardScaler()
  scaler.fit(df[cols])
  return scaler

def scaler_transform(df, scaler):
  cols = ['model_year','milage','horsepower','engine_size','cylinder']
  df[cols] = df[cols].astype(float)
  df.loc[:, cols] = scaler.transform(df[cols]).astype(float)
  return df

scaler = scaler_fit(X_train_color_simplified)
X_train_scaled = scaler_transform(X_train_color_simplified, scaler)

serialize_data(scaler, path='../models/scaler.pkl')

Encoding Categorical Variables

X_train_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 188533 entries, 0 to 188532
Data columns (total 15 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   brand         188533 non-null  object 
 1   model_year    188533 non-null  float64
 2   milage        188533 non-null  float64
 3   fuel_type     188533 non-null  object 
 4   transmission  188533 non-null  object 
 5   ext_col       188533 non-null  object 
 6   int_col       188533 non-null  object 
 7   accident      188533 non-null  object 
 8   clean_title   188533 non-null  object 
 9   horsepower    188533 non-null  float64
 10  engine_size   188533 non-null  float64
 11  cylinder      188533 non-null  float64
 12  is_electric   188533 non-null  int64  
 13  is_turbo      188533 non-null  int64  
 14  fuel_system   188533 non-null  object 
dtypes: float64(5), int64(2), object(8)
memory usage: 23.0+ MB

cat_cols = X_train_scaled.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X_train_scaled.select_dtypes(include=["float","int"]).columns.tolist()
display(cat_cols)
display(num_cols)

['brand',
 'fuel_type',
 'transmission',
 'ext_col',
 'int_col',
 'accident',
 'clean_title',
 'fuel_system']

['model_year',
 'milage',
 'horsepower',
 'engine_size',
 'cylinder',
 'is_electric',
 'is_turbo']

One-Hot Encoder

def OHencoder_fit(X):
  cat_cols = ['brand','fuel_type','transmission','ext_col','int_col','accident', 'clean_title','fuel_system']
  ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore',).set_output(transform="pandas")
  ohe.fit(X[cat_cols])
  return ohe

def encoder_transform(X, encoder):
  cat_cols = ['brand','fuel_type','transmission','ext_col','int_col','accident', 'clean_title','fuel_system']
  encoded = encoder.transform(X[cat_cols])
  df_encoded = pd.concat([X.drop(columns=cat_cols), encoded], axis=1)
  return df_encoded

OHencoder = OHencoder_fit(X_train_scaled)

serialize_data(OHencoder, path='../models/OHencoder.pkl')

Preprocess Pipeline

def preprocessing_pipeline(X, imputer, scaler, encoder):
  X_clean = X.copy()
  X_clean = format_cleaning(X_clean)
  X_clean = extract_info(X_clean)
  X_clean = imputer_transform(X_clean, imputer)
  X_clean = color_transform(X_clean)
  X_clean = scaler_transform(X_clean, scaler)
  X_clean = encoder_transform(X_clean, encoder)
  return X_clean

X_train_clean = preprocessing_pipeline(X_train, imp, scaler, OHencoder)

Handling Inconsistent format: fuel_type and transmission

Extracting Information from engine

Handling Missing Values: accident and clean_title

Simplifying Color

Transformation: Standard Scaling

Encoding Categorical Variables

One-Hot Encoder

Preprocess Pipeline

Extracting Information from `engine`

Handling Missing Values: `accident` and `clean_title`